RouteController.cpp revision 96f261e8b28048b8cb48f5a4e81822c73bb813f4
1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "RouteController.h"
18
19#include "Fwmark.h"
20#include "NetdConstants.h"
21
22#include <arpa/inet.h>
23#include <errno.h>
24#include <linux/fib_rules.h>
25#include <linux/netlink.h>
26#include <linux/rtnetlink.h>
27#include <logwrap/logwrap.h>
28#include <map>
29#include <netinet/in.h>
30#include <net/if.h>
31#include <sys/socket.h>
32#include <sys/uio.h>
33#include <unistd.h>
34
35// Avoids "non-constant-expression cannot be narrowed from type 'unsigned int' to 'unsigned short'"
36// warnings when using RTA_LENGTH(x) inside static initializers (even when x is already uint16_t).
37#define U16_RTA_LENGTH(x) static_cast<uint16_t>(RTA_LENGTH((x)))
38
39namespace {
40
// Priorities of the IP rules installed by this file. Init() describes the 'main' rule (20000) as
// having a lower priority than the default network rule (19000), i.e. a numerically larger value
// means the rule has lower priority.
const uint32_t RULE_PRIORITY_PRIVILEGED_LEGACY     = 11000;
const uint32_t RULE_PRIORITY_PER_NETWORK_EXPLICIT  = 13000;
const uint32_t RULE_PRIORITY_PER_NETWORK_INTERFACE = 14000;
const uint32_t RULE_PRIORITY_LEGACY                = 16000;
const uint32_t RULE_PRIORITY_PER_NETWORK_NORMAL    = 17000;
const uint32_t RULE_PRIORITY_DEFAULT_NETWORK       = 19000;
const uint32_t RULE_PRIORITY_MAIN                  = 20000;
// TODO: Uncomment once we are sure everything works.
#if 0
const uint32_t RULE_PRIORITY_UNREACHABLE           = 21000;
#endif

// Fixed routing tables used for legacy routes (see modifyRoute() and RouteController::Init()).
// TODO: These should be turned into per-UID tables once the kernel supports UID-based routing.
const int ROUTE_TABLE_PRIVILEGED_LEGACY = RouteController::ROUTE_TABLE_OFFSET_FROM_INDEX - 901;
const int ROUTE_TABLE_LEGACY            = RouteController::ROUTE_TABLE_OFFSET_FROM_INDEX - 902;

// Netlink header flags: every request asks for an ack; requests that add a rule or route
// (RTM_NEWRULE / RTM_NEWROUTE) additionally pass NLM_F_CREATE | NLM_F_EXCL.
const uint16_t kNetlinkRequestFlags = NLM_F_REQUEST | NLM_F_ACK;
const uint16_t kNetlinkCreateRequestFlags = kNetlinkRequestFlags | NLM_F_CREATE | NLM_F_EXCL;

// Cache of interface name -> interface index, filled in by getRouteTableForInterface() and
// pruned by flushRoutes(). It lets the table be found after the interface has gone away.
std::map<std::string, uint32_t> interfaceToIndex;
61
62uint32_t getRouteTableForInterface(const char* interface) {
63    uint32_t index = if_nametoindex(interface);
64    if (index) {
65        interfaceToIndex[interface] = index;
66    } else {
67        // If the interface goes away if_nametoindex() will return 0 but we still need to know
68        // the index so we can remove the rules and routes.
69        std::map<std::string, uint32_t>::iterator it = interfaceToIndex.find(interface);
70        if (it != interfaceToIndex.end())
71            index = it->second;
72    }
73    return index ? index + RouteController::ROUTE_TABLE_OFFSET_FROM_INDEX : 0;
74}
75
// Sends one netlink request to the kernel and waits for the ack.
//
// |iov| is an array of |iovlen| iovec entries holding the message payload. Entry 0 is reserved:
// this function overwrites it with the netlink header, which it builds from |action| and |flags|
// and whose length field covers every entry of |iov|.
//
// Returns 0 on success, or -errno if a socket call failed or the kernel reported an error.
int sendNetlinkRequest(uint16_t action, uint16_t flags, iovec* iov, int iovlen) {
    // Build the header and splice it into the reserved first slot.
    nlmsghdr header = {};
    header.nlmsg_type = action;
    header.nlmsg_flags = flags;
    iov[0].iov_base = &header;
    iov[0].iov_len = sizeof(header);
    for (int slot = 0; slot < iovlen; ++slot) {
        header.nlmsg_len += iov[slot].iov_len;
    }

    int sock = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
    if (sock == -1) {
        return -errno;
    }

    // The expected reply is a netlink header immediately followed by an nlmsgerr.
    struct {
        nlmsghdr msg;
        nlmsgerr err;
    } reply;

    sockaddr_nl kernel = {AF_NETLINK, 0, 0, 0};
    int result;
    if (connect(sock, reinterpret_cast<sockaddr*>(&kernel), sizeof(kernel)) == -1 ||
            writev(sock, iov, iovlen) == -1 ||
            (result = recv(sock, &reply, sizeof(reply), 0)) == -1) {
        // Capture errno before close() gets a chance to change it.
        result = -errno;
    } else if (static_cast<size_t>(result) == sizeof(reply)) {
        result = reply.err.error;  // Netlink errors are negative errno.
    } else {
        result = -EBADMSG;
    }

    close(sock);
    return result;
}
118
119// Adds or removes a routing rule for IPv4 and IPv6.
120//
121// + If |table| is non-zero, the rule points at the specified routing table. Otherwise, the rule
122//   returns ENETUNREACH.
123// + If |mask| is non-zero, the rule matches the specified fwmark and mask. Otherwise, |fwmark| is
124//   ignored.
125// + If |interface| is non-NULL, the rule matches the specified outgoing interface.
126//
127// Returns 0 on success or negative errno on failure.
128int modifyIpRule(uint16_t action, uint32_t priority, uint32_t table, uint32_t fwmark, uint32_t mask,
129                 const char* interface) {
130    // The interface name must include exactly one terminating NULL and be properly padded, or older
131    // kernels will refuse to delete rules.
132    uint8_t padding[RTA_ALIGNTO] = {0, 0, 0, 0};
133    uint16_t paddingLength = 0;
134    size_t interfaceLength = 0;
135    char oifname[IFNAMSIZ];
136    if (interface) {
137        interfaceLength = strlcpy(oifname, interface, IFNAMSIZ) + 1;
138        if (interfaceLength > IFNAMSIZ) {
139            return -ENAMETOOLONG;
140        }
141        paddingLength = RTA_SPACE(interfaceLength) - RTA_LENGTH(interfaceLength);
142    }
143
144    // Assemble a rule request and put it in an array of iovec structures.
145    fib_rule_hdr rule = {
146        .action = static_cast<uint8_t>(table ? FR_ACT_TO_TBL : FR_ACT_UNREACHABLE),
147    };
148
149    rtattr fra_priority = { U16_RTA_LENGTH(sizeof(priority)),  FRA_PRIORITY };
150    rtattr fra_table    = { U16_RTA_LENGTH(sizeof(table)),     FRA_TABLE };
151    rtattr fra_fwmark   = { U16_RTA_LENGTH(sizeof(fwmark)),    FRA_FWMARK };
152    rtattr fra_fwmask   = { U16_RTA_LENGTH(sizeof(mask)),      FRA_FWMASK };
153    rtattr fra_oifname  = { U16_RTA_LENGTH(interfaceLength),   FRA_OIFNAME };
154
155    iovec iov[] = {
156        { NULL,           0 },
157        { &rule,          sizeof(rule) },
158        { &fra_priority,  sizeof(fra_priority) },
159        { &priority,      sizeof(priority) },
160        { &fra_table,     table ? sizeof(fra_table) : 0 },
161        { &table,         table ? sizeof(table) : 0 },
162        { &fra_fwmark,    mask ? sizeof(fra_fwmark) : 0 },
163        { &fwmark,        mask ? sizeof(fwmark) : 0 },
164        { &fra_fwmask,    mask ? sizeof(fra_fwmask) : 0 },
165        { &mask,          mask ? sizeof(mask) : 0 },
166        { &fra_oifname,   interface ? sizeof(fra_oifname) : 0 },
167        { oifname,        interfaceLength },
168        { padding,        paddingLength },
169    };
170
171    uint16_t flags = (action == RTM_NEWRULE) ? kNetlinkCreateRequestFlags : kNetlinkRequestFlags;
172    uint8_t family[] = {AF_INET, AF_INET6};
173    for (size_t i = 0; i < ARRAY_SIZE(family); ++i) {
174        rule.family = family[i];
175        int ret = sendNetlinkRequest(action, flags, iov, ARRAY_SIZE(iov));
176        if (ret) {
177            return ret;
178        }
179    }
180
181    return 0;
182}
183
184// Adds or deletes an IPv4 or IPv6 route.
185// Returns 0 on success or negative errno on failure.
186int modifyIpRoute(uint16_t action, uint32_t table, const char* interface, const char* destination,
187                  const char* nexthop) {
188    // At least the destination must be non-null.
189    if (!destination) {
190        return -EFAULT;
191    }
192
193    // Parse the prefix.
194    uint8_t rawAddress[sizeof(in6_addr)];
195    uint8_t family, prefixLength;
196    int rawLength = parsePrefix(destination, &family, rawAddress, sizeof(rawAddress),
197                                &prefixLength);
198    if (rawLength < 0) {
199        return rawLength;
200    }
201
202    if (static_cast<size_t>(rawLength) > sizeof(rawAddress)) {
203        return -ENOBUFS;  // Cannot happen; parsePrefix only supports IPv4 and IPv6.
204    }
205
206    // If an interface was specified, find the ifindex.
207    uint32_t ifindex;
208    if (interface) {
209        ifindex = if_nametoindex(interface);
210        if (!ifindex) {
211            return -ENODEV;
212        }
213    }
214
215    // If a nexthop was specified, parse it as the same family as the prefix.
216    uint8_t rawNexthop[sizeof(in6_addr)];
217    if (nexthop && !inet_pton(family, nexthop, rawNexthop)) {
218        return -EINVAL;
219    }
220
221    // Assemble a rtmsg and put it in an array of iovec structures.
222    rtmsg rtmsg = {
223        .rtm_protocol = RTPROT_STATIC,
224        .rtm_type = RTN_UNICAST,
225        .rtm_family = family,
226        .rtm_dst_len = prefixLength,
227    };
228
229    rtattr rta_table   = { U16_RTA_LENGTH(sizeof(table)),    RTA_TABLE };
230    rtattr rta_oif     = { U16_RTA_LENGTH(sizeof(ifindex)),  RTA_OIF };
231    rtattr rta_dst     = { U16_RTA_LENGTH(rawLength),        RTA_DST };
232    rtattr rta_gateway = { U16_RTA_LENGTH(rawLength),        RTA_GATEWAY };
233
234    iovec iov[] = {
235        { NULL,          0 },
236        { &rtmsg,        sizeof(rtmsg) },
237        { &rta_table,    sizeof(rta_table) },
238        { &table,        sizeof(table) },
239        { &rta_dst,      sizeof(rta_dst) },
240        { rawAddress,    static_cast<size_t>(rawLength) },
241        { &rta_oif,      interface ? sizeof(rta_oif) : 0 },
242        { &ifindex,      interface ? sizeof(ifindex) : 0 },
243        { &rta_gateway,  nexthop ? sizeof(rta_gateway) : 0 },
244        { rawNexthop,    nexthop ? static_cast<size_t>(rawLength) : 0 },
245    };
246
247    uint16_t flags = (action == RTM_NEWROUTE) ? kNetlinkCreateRequestFlags : kNetlinkRequestFlags;
248    return sendNetlinkRequest(action, flags, iov, ARRAY_SIZE(iov));
249}
250
251int modifyPerNetworkRules(unsigned netId, const char* interface, Permission permission, bool add,
252                          bool modifyIptables) {
253    uint32_t table = getRouteTableForInterface(interface);
254    if (!table) {
255        return -ESRCH;
256    }
257
258    uint16_t action = add ? RTM_NEWRULE : RTM_DELRULE;
259    int ret;
260
261    Fwmark fwmark;
262    fwmark.permission = permission;
263
264    Fwmark mask;
265    mask.permission = permission;
266
267    // A rule to route traffic based on a chosen outgoing interface.
268    //
269    // Supports apps that use SO_BINDTODEVICE or IP_PKTINFO options and the kernel that already
270    // knows the outgoing interface (typically for link-local communications).
271    if ((ret = modifyIpRule(action, RULE_PRIORITY_PER_NETWORK_INTERFACE, table, fwmark.intValue,
272                            mask.intValue, interface)) != 0) {
273        return ret;
274    }
275
276    // A rule to route traffic based on the chosen network.
277    //
278    // This is for sockets that have not explicitly requested a particular network, but have been
279    // bound to one when they called connect(). This ensures that sockets connected on a particular
280    // network stay on that network even if the default network changes.
281    fwmark.netId = netId;
282    mask.netId = FWMARK_NET_ID_MASK;
283    if ((ret = modifyIpRule(action, RULE_PRIORITY_PER_NETWORK_NORMAL, table, fwmark.intValue,
284                            mask.intValue, NULL)) != 0) {
285        return ret;
286    }
287
288    // A rule to route traffic based on an explicitly chosen network.
289    //
290    // Supports apps that use the multinetwork APIs to restrict their traffic to a network.
291    //
292    // We don't really need to check the permission bits of the fwmark here, as they would've been
293    // checked at the time the netId was set into the fwmark, but we do so to be consistent.
294    fwmark.explicitlySelected = true;
295    mask.explicitlySelected = true;
296    if ((ret = modifyIpRule(action, RULE_PRIORITY_PER_NETWORK_EXPLICIT, table, fwmark.intValue,
297                            mask.intValue, NULL)) != 0) {
298        return ret;
299    }
300
301    // An iptables rule to mark incoming packets on a network with the netId of the network.
302    //
303    // This is so that the kernel can:
304    // + Use the right fwmark for (and thus correctly route) replies (e.g.: TCP RST, ICMP errors,
305    //   ping replies).
306    // + Mark sockets that accept connections from this interface so that the connection stays on
307    //   the same interface.
308    if (modifyIptables) {
309        const char* iptablesAction = add ? "-A" : "-D";
310        char markString[UINT32_HEX_STRLEN];
311        snprintf(markString, sizeof(markString), "0x%x", netId);
312        if (execIptables(V4V6, "-t", "mangle", iptablesAction, "INPUT", "-i", interface,
313                         "-j", "MARK", "--set-mark", markString, NULL)) {
314            return -EREMOTEIO;
315        }
316    }
317
318    return 0;
319}
320
321int modifyDefaultNetworkRules(const char* interface, Permission permission, uint16_t action) {
322    uint32_t table = getRouteTableForInterface(interface);
323    if (!table) {
324        return -ESRCH;
325    }
326
327    Fwmark fwmark;
328    fwmark.netId = 0;
329    fwmark.permission = permission;
330
331    Fwmark mask;
332    mask.netId = FWMARK_NET_ID_MASK;
333    mask.permission = permission;
334
335    return modifyIpRule(action, RULE_PRIORITY_DEFAULT_NETWORK, table, fwmark.intValue,
336                        mask.intValue, NULL);
337}
338
339// Adds or removes an IPv4 or IPv6 route to the specified table and, if it's directly-connected
340// route, to the main table as well.
341// Returns 0 on success or negative errno on failure.
342int modifyRoute(const char* interface, const char* destination, const char* nexthop,
343                uint16_t action, RouteController::TableType tableType, unsigned /* uid */) {
344    uint32_t table = 0;
345    switch (tableType) {
346        case RouteController::INTERFACE: {
347            table = getRouteTableForInterface(interface);
348            break;
349        }
350        case RouteController::LEGACY: {
351            // TODO: Use the UID to assign a unique table per UID instead of this fixed table.
352            table = ROUTE_TABLE_LEGACY;
353            break;
354        }
355        case RouteController::PRIVILEGED_LEGACY: {
356            // TODO: Use the UID to assign a unique table per UID instead of this fixed table.
357            table = ROUTE_TABLE_PRIVILEGED_LEGACY;
358            break;
359        }
360    }
361    if (!table) {
362        return -ESRCH;
363    }
364
365    int ret = modifyIpRoute(action, table, interface, destination, nexthop);
366    if (ret != 0) {
367        return ret;
368    }
369
370    // If there's no nexthop, this is a directly connected route. Add it to the main table also, to
371    // let the kernel find it when validating nexthops when global routes are added.
372    if (!nexthop) {
373        ret = modifyIpRoute(action, RT_TABLE_MAIN, interface, destination, NULL);
374        // A failure with action == ADD && errno == EEXIST means that the route already exists in
375        // the main table, perhaps because the kernel added it automatically as part of adding the
376        // IP address to the interface. Ignore this, but complain about everything else.
377        if (ret != 0 && !(action == RTM_NEWROUTE && ret == -EEXIST)) {
378            return ret;
379        }
380    }
381
382    return 0;
383}
384
385bool flushRoutes(const char* interface) {
386    uint32_t table = getRouteTableForInterface(interface);
387    if (!table) {
388        return false;
389    }
390    interfaceToIndex.erase(interface);
391
392    char tableString[UINT32_STRLEN];
393    snprintf(tableString, sizeof(tableString), "%u", table);
394
395    const char* version[] = {"-4", "-6"};
396    for (size_t i = 0; i < ARRAY_SIZE(version); ++i) {
397        const char* argv[] = {
398            IP_PATH,
399            version[i],
400            "route"
401            "flush",
402            "table",
403            tableString,
404        };
405        int argc = ARRAY_SIZE(argv);
406
407        if (!android_fork_execvp(argc, const_cast<char**>(argv), NULL, false, false)) {
408            return false;
409        }
410    }
411
412    return true;
413}
414
415}  // namespace
416
417void RouteController::Init() {
418    // Add a new rule to look up the 'main' table, with the same selectors as the "default network"
419    // rule, but with a lower priority. Since the default network rule points to a table with a
420    // default route, the rule we're adding will never be used for normal routing lookups. However,
421    // the kernel may fall-through to it to find directly-connected routes when it validates that a
422    // nexthop (in a route being added) is reachable.
423    Fwmark fwmark;
424    fwmark.netId = 0;
425
426    Fwmark mask;
427    mask.netId = FWMARK_NET_ID_MASK;
428
429    modifyIpRule(RTM_NEWRULE, RULE_PRIORITY_MAIN, RT_TABLE_MAIN, fwmark.intValue, mask.intValue,
430                 NULL);
431
432    // Add rules to allow lookup of legacy routes.
433    //
434    // TODO: Remove these once the kernel supports UID-based routing. Instead, add them on demand
435    // when routes are added.
436    fwmark.netId = 0;
437    mask.netId = 0;
438
439    fwmark.explicitlySelected = false;
440    mask.explicitlySelected = true;
441
442    modifyIpRule(RTM_NEWRULE, RULE_PRIORITY_LEGACY, ROUTE_TABLE_LEGACY, fwmark.intValue,
443                 mask.intValue, NULL);
444
445    fwmark.permission = PERMISSION_CONNECTIVITY_INTERNAL;
446    mask.permission = PERMISSION_CONNECTIVITY_INTERNAL;
447
448    modifyIpRule(RTM_NEWRULE, RULE_PRIORITY_PRIVILEGED_LEGACY, ROUTE_TABLE_PRIVILEGED_LEGACY,
449                 fwmark.intValue, mask.intValue, NULL);
450
451// TODO: Uncomment once we are sure everything works.
452#if 0
453    // Add a rule to preempt the pre-defined "from all lookup main" rule. This ensures that packets
454    // that are already marked with a specific NetId don't fall-through to the main table.
455    modifyIpRule(RTM_NEWRULE, RULE_PRIORITY_UNREACHABLE, 0, 0, 0, NULL);
456#endif
457}
458
459int RouteController::addInterfaceToNetwork(unsigned netId, const char* interface,
460                                           Permission permission) {
461    return modifyPerNetworkRules(netId, interface, permission, true, true);
462}
463
464int RouteController::removeInterfaceFromNetwork(unsigned netId, const char* interface,
465                                                Permission permission) {
466    return modifyPerNetworkRules(netId, interface, permission, false, true) &&
467           flushRoutes(interface);
468}
469
470int RouteController::modifyNetworkPermission(unsigned netId, const char* interface,
471                                             Permission oldPermission, Permission newPermission) {
472    // Add the new rules before deleting the old ones, to avoid race conditions.
473    return modifyPerNetworkRules(netId, interface, newPermission, true, false) &&
474           modifyPerNetworkRules(netId, interface, oldPermission, false, false);
475}
476
477int RouteController::addToDefaultNetwork(const char* interface, Permission permission) {
478    return modifyDefaultNetworkRules(interface, permission, RTM_NEWRULE);
479}
480
481int RouteController::removeFromDefaultNetwork(const char* interface, Permission permission) {
482    return modifyDefaultNetworkRules(interface, permission, RTM_DELRULE);
483}
484
485int RouteController::addRoute(const char* interface, const char* destination,
486                              const char* nexthop, TableType tableType, unsigned uid) {
487    return modifyRoute(interface, destination, nexthop, RTM_NEWROUTE, tableType, uid);
488}
489
490int RouteController::removeRoute(const char* interface, const char* destination,
491                                 const char* nexthop, TableType tableType, unsigned uid) {
492    return modifyRoute(interface, destination, nexthop, RTM_DELROUTE, tableType, uid);
493}
494