Skip to content

Commit

Permalink
northd: ECMP prefer local routes if possible.
Browse files Browse the repository at this point in the history
Assume the following setup:
1. there is an LR connected via LRP to some internal networks
2. the LR is connected via two separate LRPs (LRP-ext-1, LRP-ext-2)
   to a external network
3. there are two default routes, one for each of the external LRPs
4. the external LRPs have ha_chassis_groups with different priorities

In this case for internal traffic arriving to the LR it would first
determine one of the ecmp routes to use and then forward the traffic
appropriately.

This can mean that if we are on the same chassis as LRP-ext-1 then we
could still choose a route that outputs via the chassis of LRP-ext-2.
In this case we would send traffic to another chassis for no real
reason.

To avoid this case we add for each ecmp route additional non-ecmp
routes. These use is_chassis_resident to filter for the above case and
then choose local routes. If there are no local routes available we use
the normal ecmp route selection.

This feature is especially needed in the case of active-active routing.
There there will be a lot of per "project" LRs connected to one LS which
then connects to the external LR as described above. As the LRPs of the
"project" LRs mostly already need ha_chassis_groups for NAT handling the
chance of the traffic to be already on an appropriate chassis is quite
high.

Signed-off-by: Felix Huettner <[email protected]>
  • Loading branch information
felixhuettner committed Nov 14, 2024
1 parent c65f405 commit aae0b90
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 62 deletions.
129 changes: 119 additions & 10 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,16 @@ BUILD_ASSERT_DECL(ACL_OBS_STAGE_MAX < (1 << 2));
* 1. (highest priority) connected routes
* 2. static routes
* 3. routes learned from the outside via ovn-controller (e.g. bgp)
* 4. (lowest priority) src-ip routes */
#define ROUTE_PRIO_OFFSET_MULTIPLIER 4
* 4. (lowest priority) src-ip routes
*
* When having ecmp routes with multiple different output ports on different
* chassis we prioritize being on the same chassis.
* However longer prefix matches are more important than being local. */
#define ROUTE_PRIO_OFFSET_MULTIPLIER 8
#define ROUTE_PRIO_OFFSET_LEARNED 1
#define ROUTE_PRIO_OFFSET_STATIC 2
#define ROUTE_PRIO_OFFSET_CONNECTED 3
#define ROUTE_PRIO_OFFSET_ADD_SPECIFIC_CHASSIS 4

/* Returns the type of the datapath to which a flow with the given 'stage' may
* be added. */
Expand Down Expand Up @@ -11731,6 +11736,10 @@ struct ecmp_groups_node {
uint32_t route_table_id;
uint16_t route_count;
struct ovs_list route_list; /* Contains ecmp_route_list_node */
/* If this is set the route should only apply to chassis where the port
* is resident. It will also receive a higher priority*/
struct sset ports_resident;
bool has_different_chassis;
};

static void
Expand All @@ -11747,6 +11756,22 @@ ecmp_groups_add_route(struct ecmp_groups_node *group,
er->route = route;
er->id = ++group->route_count;
ovs_list_insert(&group->route_list, &er->list_node);

if (!group->has_different_chassis) {
struct ecmp_route_list_node *ern;
struct sset chassis_names = SSET_INITIALIZER(&chassis_names);
LIST_FOR_EACH (ern, list_node, &group->route_list) {
if (ern->route->is_discard_route ||
!ern->route->out_port->is_active_active) {
continue;
}
sset_add(&chassis_names, ern->route->out_port->aa_chassis_name);
}
if (sset_count(&chassis_names) > 1) {
group->has_different_chassis = true;
}
sset_destroy(&chassis_names);
}
}

static struct ecmp_groups_node *
Expand All @@ -11768,7 +11793,9 @@ ecmp_groups_add(struct hmap *ecmp_groups,
eg->is_src_route = route->is_src_route;
eg->source = route->source;
eg->route_table_id = route->route_table_id;
eg->has_different_chassis = false;
ovs_list_init(&eg->route_list);
sset_init(&eg->ports_resident);
ecmp_groups_add_route(eg, route);

return eg;
Expand Down Expand Up @@ -11799,6 +11826,7 @@ ecmp_groups_destroy(struct hmap *ecmp_groups)
ovs_list_remove(&er->list_node);
free(er);
}
sset_destroy(&eg->ports_resident);
hmap_remove(ecmp_groups, &eg->hmap_node);
free(eg);
}
Expand All @@ -11808,15 +11836,19 @@ ecmp_groups_destroy(struct hmap *ecmp_groups)
struct unique_routes_node {
struct hmap_node hmap_node;
const struct parsed_route *route;
/* If this is set the route should only apply to chassis where the port
* is resident. It will also receive a higher priority*/
const char *port_resident;
};

static void
static struct unique_routes_node *
unique_routes_add(struct hmap *unique_routes,
const struct parsed_route *route)
{
struct unique_routes_node *ur = xmalloc(sizeof *ur);
struct unique_routes_node *ur = xzalloc(sizeof *ur);
ur->route = route;
hmap_insert(unique_routes, &ur->hmap_node, route->hash);
return ur;
}

/* Remove the unique_routes_node from the hmap, and return the parsed_route
Expand Down Expand Up @@ -12106,7 +12138,21 @@ build_ecmp_route_flow(struct lflow_table *lflows, struct ovn_datapath *od,
char *prefix_s = build_route_prefix_s(&eg->prefix, eg->plen);
build_route_match(NULL, eg->route_table_id, prefix_s, eg->plen,
eg->is_src_route, is_ipv4, &route_match, &priority, eg->source);
free(prefix_s);

if (sset_count(&eg->ports_resident) > 0) {
priority += ROUTE_PRIO_OFFSET_ADD_SPECIFIC_CHASSIS;
ds_put_format(&route_match, " && (");
bool first = true;
const char *port;
SSET_FOR_EACH (port, &eg->ports_resident) {
if (!first) {
ds_put_format(&route_match, "||");
}
first = false;
ds_put_format(&route_match, " is_chassis_resident(\"%s\") ", port);
}
ds_put_format(&route_match, ")");
}

struct ds actions = DS_EMPTY_INITIALIZER;
ds_put_format(&actions, "ip.ttl--; flags.loopback = 1; %s = %"PRIu16
Expand Down Expand Up @@ -12171,7 +12217,9 @@ build_ecmp_route_flow(struct lflow_table *lflows, struct ovn_datapath *od,
ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_ROUTING_ECMP, 100,
ds_cstr(&match), ds_cstr(&actions),
route->source_hint, lflow_ref);

}
free(prefix_s);
sset_destroy(&visited_ports);
ds_destroy(&match);
ds_destroy(&route_match);
Expand All @@ -12185,7 +12233,8 @@ add_route(struct lflow_table *lflows, struct ovn_datapath *od,
bool is_src_route, const uint32_t rtb_id,
const struct sset *bfd_ports,
const struct ovsdb_idl_row *stage_hint, bool is_discard_route,
enum route_source source, struct lflow_ref *lflow_ref)
enum route_source source, struct lflow_ref *lflow_ref,
const char *port_resident)
{
bool is_ipv4 = strchr(network_s, '.') ? true : false;
struct ds match = DS_EMPTY_INITIALIZER;
Expand All @@ -12203,6 +12252,12 @@ add_route(struct lflow_table *lflows, struct ovn_datapath *od,
build_route_match(op_inport, rtb_id, network_s, plen, is_src_route,
is_ipv4, &match, &priority, source);

if (port_resident) {
priority += ROUTE_PRIO_OFFSET_ADD_SPECIFIC_CHASSIS;
ds_put_format(&match, " && is_chassis_resident(\"%s\")",
port_resident);
}

struct ds common_actions = DS_EMPTY_INITIALIZER;
struct ds actions = DS_EMPTY_INITIALIZER;
if (is_discard_route) {
Expand Down Expand Up @@ -12249,15 +12304,17 @@ static void
build_route_flow(struct lflow_table *lflows, struct ovn_datapath *od,
const struct parsed_route *route,
const struct sset *bfd_ports,
struct lflow_ref *lflow_ref)
struct lflow_ref *lflow_ref,
const char *port_resident)
{
char *prefix_s = build_route_prefix_s(&route->prefix, route->plen);
add_route(lflows, route->is_discard_route ? od : route->out_port->od,
route->out_port, route->lrp_addr_s, prefix_s,
route->plen, route->nexthop, route->is_src_route,
route->route_table_id, bfd_ports,
route->source_hint,
route->is_discard_route, route->source, lflow_ref);
route->is_discard_route, route->source, lflow_ref,
port_resident);

free(prefix_s);
}
Expand Down Expand Up @@ -14039,6 +14096,58 @@ build_route_flows_for_lrouter(
}
}
}

/* We now duplicate some routes based on ecmp groups. The goal here is to
* prioritize taking some route of a ecmp route if we are already on the
* respective chassis. This saves us potentially forwarding traffic between
* chassis for no reason. */
HMAP_FOR_EACH_SAFE (group, hmap_node, &ecmp_groups) {
if (!group->has_different_chassis) {
continue;
}
struct simap chassis_count = SIMAP_INITIALIZER(&chassis_count);
struct ecmp_route_list_node *er;
LIST_FOR_EACH (er, list_node, &group->route_list) {
if (er->route->is_discard_route ||
!er->route->out_port->is_active_active) {
continue;
}
simap_increase(&chassis_count,
er->route->out_port->aa_chassis_name, 1);
}


struct simap_node *chassis_node;
SIMAP_FOR_EACH (chassis_node, &chassis_count) {
ovs_assert(chassis_node->data != 0);
struct ecmp_groups_node *found_group = NULL;
LIST_FOR_EACH (er, list_node, &group->route_list) {
if (er->route->is_discard_route ||
!er->route->out_port->is_active_active ||
strcmp(chassis_node->name,
er->route->out_port->aa_chassis_name)) {
continue;
}
const char *port_name = er->route->out_port->cr_port->key;
if (chassis_node->data == 1) {
struct unique_routes_node *ur =
unique_routes_add(&unique_routes, er->route);
ur->port_resident = port_name;
} else {
if (!found_group) {
found_group = ecmp_groups_add(&ecmp_groups, er->route);
} else {
ecmp_groups_add_route(found_group, er->route);
}
sset_add(&found_group->ports_resident, port_name);
}
}
}

simap_destroy(&chassis_count);
}

/* And now really add the routing flows */
HMAP_FOR_EACH (group, hmap_node, &ecmp_groups) {
/* add a flow in IP_ROUTING, and one flow for each member in
* IP_ROUTING_ECMP. */
Expand All @@ -14047,7 +14156,7 @@ build_route_flows_for_lrouter(
const struct unique_routes_node *ur;
HMAP_FOR_EACH (ur, hmap_node, &unique_routes) {
build_route_flow(lflows, od, ur->route,
bfd_ports, lflow_ref);
bfd_ports, lflow_ref, ur->port_resident);
}
ecmp_groups_destroy(&ecmp_groups);
unique_routes_destroy(&unique_routes);
Expand Down Expand Up @@ -17300,7 +17409,7 @@ build_routable_flows_for_router_port(
laddrs->ipv4_addrs[k].plen, NULL, false, 0,
bfd_ports, &router_port->nbrp->header_,
false, ROUTE_SOURCE_CONNECTED,
lrp->stateful_lflow_ref);
lrp->stateful_lflow_ref, NULL);
}
}
}
Expand Down
Loading

0 comments on commit aae0b90

Please sign in to comment.