diff options
Diffstat (limited to 'net')
139 files changed, 2384 insertions, 1488 deletions
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 37a78d20c0f6..ba1210253f5e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -94,8 +94,6 @@ struct p9_trans_rdma { struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; - struct ib_mr *dma_mr; - u32 lkey; long timeout; int sq_depth; struct semaphore sq_sem; @@ -382,9 +380,6 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) if (!rdma) return; - if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) - ib_dereg_mr(rdma->dma_mr); - if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); @@ -415,7 +410,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) sge.addr = c->busa; sge.length = client->msize; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_RECV; @@ -506,7 +501,7 @@ dont_need_post_recv: sge.addr = c->busa; sge.length = c->req->tc->size; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; c->wc_op = IB_WC_SEND; @@ -647,7 +642,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; struct ib_cq_init_attr cq_attr = {}; /* Parse the transport specific mount options */ @@ -700,11 +694,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; - /* Query the device attributes */ - err = ib_query_device(rdma->cm_id->device, &devattr); - if (err) - goto error; - /* Create the Completion Queue */ cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1; rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, @@ -719,17 +708,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (IS_ERR(rdma->pd)) goto error; - /* Cache the DMA lkey in the transport */ - rdma->dma_mr = NULL; - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) - rdma->lkey = rdma->cm_id->device->local_dma_lkey; - else { - rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rdma->dma_mr)) - goto error; - rdma->lkey = rdma->dma_mr->lkey; - } - /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55dfecbe2..e07f551a863c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh) static int clip_encap(struct atm_vcc *vcc, int mode) { + if (!CLIP_VCC(vcc)) + return -EBADFD; + CLIP_VCC(vcc)->encap = mode; return 0; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b4548c739a64..2dda439c8cb8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -91,10 +91,50 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn) * autoconnect action, remove them completely. If they are, just unmark * them as waiting for connection, by clearing explicit_connect field. */ - if (params->auto_connect == HCI_AUTO_CONN_EXPLICIT) + params->explicit_connect = false; + + list_del_init(¶ms->action); + + switch (params->auto_connect) { + case HCI_AUTO_CONN_EXPLICIT: hci_conn_params_del(conn->hdev, bdaddr, bdaddr_type); - else - params->explicit_connect = false; + /* return instead of break to avoid duplicate scan update */ + return; + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + list_add(¶ms->action, &conn->hdev->pend_le_conns); + break; + case HCI_AUTO_CONN_REPORT: + list_add(¶ms->action, &conn->hdev->pend_le_reports); + break; + default: + break; + } + + hci_update_background_scan(conn->hdev); +} + +static void hci_conn_cleanup(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) + hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); + + hci_chan_list_flush(conn); + + hci_conn_hash_del(hdev, conn); + + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + + hci_conn_del_sysfs(conn); + + debugfs_remove_recursive(conn->debugfs); + + hci_dev_put(hdev); + + hci_conn_put(conn); } /* This function requires the caller holds hdev->lock */ @@ -102,8 +142,13 @@ static void hci_connect_le_scan_remove(struct hci_conn *conn) { hci_connect_le_scan_cleanup(conn); - hci_conn_hash_del(conn->hdev, conn); - hci_update_background_scan(conn->hdev); + /* We can't call hci_conn_del here since that would deadlock + * with trying to call cancel_delayed_work_sync(&conn->disc_work). + * Instead, call just hci_conn_cleanup() which contains the bare + * minimum cleanup operations needed for a connection in this + * state. + */ + hci_conn_cleanup(conn); } static void hci_acl_create_connection(struct hci_conn *conn) @@ -581,27 +626,17 @@ int hci_conn_del(struct hci_conn *conn) } } - hci_chan_list_flush(conn); - if (conn->amp_mgr) amp_mgr_put(conn->amp_mgr); - hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); - skb_queue_purge(&conn->data_q); - hci_conn_del_sysfs(conn); - - debugfs_remove_recursive(conn->debugfs); - - if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) - hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); - - hci_dev_put(hdev); - - hci_conn_put(conn); + /* Remove the connection from the list and cleanup its remaining + * state. This is a separate function since for some cases like + * BT_CONNECT_SCAN we *only* want the cleanup part without the + * rest of hci_conn_del. + */ + hci_conn_cleanup(conn); return 0; } @@ -973,15 +1008,23 @@ static int hci_explicit_conn_params_set(struct hci_request *req, if (is_connected(hdev, addr, addr_type)) return -EISCONN; - params = hci_conn_params_add(hdev, addr, addr_type); - if (!params) - return -EIO; + params = hci_conn_params_lookup(hdev, addr, addr_type); + if (!params) { + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return -ENOMEM; - /* If we created new params, or existing params were marked as disabled, - * mark them to be used just once to connect. - */ - if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { + /* If we created new params, mark them to be deleted in + * hci_connect_le_scan_cleanup. It's different case than + * existing disabled params, those will stay after cleanup. + */ params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + } + + /* We're trying to connect, so make sure params are at pend_le_conns */ + if (params->auto_connect == HCI_AUTO_CONN_DISABLED || + params->auto_connect == HCI_AUTO_CONN_REPORT || + params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { list_del_init(¶ms->action); list_add(¶ms->action, &hdev->pend_le_conns); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index adcbc74c2432..e837539452fb 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2861,13 +2861,6 @@ struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev, return param; } - list_for_each_entry(param, &hdev->pend_le_reports, action) { - if (bacmp(¶m->addr, addr) == 0 && - param->addr_type == addr_type && - param->explicit_connect) - return param; - } - return NULL; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 186041866315..bc31099d3b5b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -55,7 +55,12 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) wake_up_bit(&hdev->flags, HCI_INQUIRY); hci_dev_lock(hdev); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + /* Set discovery state to stopped if we're not doing LE active + * scanning. + */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || + hdev->le_scan_type != LE_SCAN_ACTIVE) + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); hci_dev_unlock(hdev); hci_conn_check_pending(hdev); @@ -4648,8 +4653,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* If we're not connectable only connect devices that we have in * our pend_le_conns list. */ - params = hci_explicit_connect_lookup(hdev, addr, addr_type); - + params = hci_pend_le_action_lookup(&hdev->pend_le_conns, addr, + addr_type); if (!params) return NULL; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ccaf5a436d8f..c4fe2fee753f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3545,6 +3545,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, auth_type); } else { u8 addr_type; + struct hci_conn_params *p; /* Convert from L2CAP channel address type to HCI address type */ @@ -3562,7 +3563,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, * If connection parameters already exist, then they * will be kept and this function does nothing. */ - hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + + if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) + p->auto_connect = HCI_AUTO_CONN_DISABLED; conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type, sec_level, @@ -6117,14 +6121,21 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr, __hci_update_background_scan(req); break; case HCI_AUTO_CONN_REPORT: - list_add(¶ms->action, &hdev->pend_le_reports); + if (params->explicit_connect) + list_add(¶ms->action, &hdev->pend_le_conns); + else + list_add(¶ms->action, &hdev->pend_le_reports); __hci_update_background_scan(req); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: if (!is_connected(hdev, addr, addr_type)) { list_add(¶ms->action, &hdev->pend_le_conns); - __hci_update_background_scan(req); + /* If we are in scan phase of connecting, we were + * already added to pend_le_conns and scanning. + */ + if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT) + __hci_update_background_scan(req); } break; } @@ -6379,7 +6390,8 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { + if (params->auto_connect == HCI_AUTO_CONN_DISABLED || + params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS); mgmt_pending_remove(cmd); @@ -6415,6 +6427,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, if (p->auto_connect == HCI_AUTO_CONN_DISABLED) continue; device_removed(sk, hdev, &p->addr, p->addr_type); + if (p->explicit_connect) { + p->auto_connect = HCI_AUTO_CONN_EXPLICIT; + continue; + } list_del(&p->action); list_del(&p->list); kfree(p); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ad82324f710f..0510a577a7b5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2311,12 +2311,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (!conn) return 1; - chan = conn->smp; - if (!chan) { - BT_ERR("SMP security requested but not available"); - return 1; - } - if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; @@ -2330,6 +2324,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; + chan = conn->smp; + if (!chan) { + BT_ERR("SMP security requested but not available"); + return 1; + } + l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66efdc21f548..480b3de1a0e3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1006,7 +1006,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); - len = sizeof(*ih); + len = skb_transport_offset(skb) + sizeof(*ih); for (i = 0; i < num; i++) { len += sizeof(*grec); @@ -1067,7 +1067,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, icmp6h = icmp6_hdr(skb); num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = sizeof(*icmp6h); + len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { __be16 *nsrcs, _nsrcs; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index af5e187553fd..ea748c93a07f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -16,7 +16,6 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> -#include <net/switchdev.h> #include <uapi/linux/if_bridge.h> #include "br_private.h" diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cd8cc9e804b..5f5a02b49a99 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -117,10 +117,11 @@ out_filt: return err; } -static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, - u16 vid) +static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, + u16 vid) { const struct net_device_ops *ops = dev->netdev_ops; + int err = 0; /* If driver uses VLAN ndo ops, use 8021q to delete vid * on device, otherwise try switchdev ops to delete vid. @@ -137,8 +138,12 @@ static void __vlan_vid_del(struct net_device *dev, struct net_bridge *br, }, }; - switchdev_port_obj_del(dev, &vlan_obj); + err = switchdev_port_obj_del(dev, &vlan_obj); + if (err == -EOPNOTSUPP) + err = 0; } + + return err; } static int __vlan_del(struct net_port_vlans *v, u16 vid) @@ -151,7 +156,11 @@ static int __vlan_del(struct net_port_vlans *v, u16 vid) if (v->port_idx) { struct net_bridge_port *p = v->parent.port; - __vlan_vid_del(p->dev, p->br, vid); + int err; + + err = __vlan_vid_del(p->dev, p->br, vid); + if (err) + return err; } clear_bit(vid, v->vlan_bitmap); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f30329f72641..54a00d66509e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name, opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; + opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; /* get mon ip(s) */ /* ip1[:port1][,ip2[:port2]...] */ @@ -517,8 +518,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) struct ceph_options *opt = client->options; size_t pos = m->count; - if (opt->name) - seq_printf(m, "name=%s,", opt->name); + if (opt->name) { + seq_puts(m, "name="); + seq_escape(m, opt->name, ", \t\n\\"); + seq_putc(m, ','); + } if (opt->key) seq_puts(m, "secret=<hidden>,"); diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 790fe89d90c0..4440edcce0d6 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) return 0; } - - -#define AES_KEY_SIZE 16 - static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) { return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e3be1d22a247..b9b0e3b5da49 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -163,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache; static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; @@ -176,7 +177,7 @@ static struct lock_class_key socket_class; static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); -static void con_work(struct work_struct *); +static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* @@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - ceph_msgr_slab_exit(); - BUG_ON(zero_page == NULL); page_cache_release(zero_page); zero_page = NULL; + + ceph_msgr_slab_exit(); } int ceph_msgr_init(void) { + if (ceph_msgr_slab_init()) + return -ENOMEM; + BUG_ON(zero_page != NULL); zero_page = ZERO_PAGE(0); page_cache_get(zero_page); - if (ceph_msgr_slab_init()) - return -ENOMEM; - /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. @@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); - INIT_DELAYED_WORK(&con->work, con_work); + INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CON_STATE_CLOSED; } @@ -1351,7 +1352,16 @@ static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); - con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec now = CURRENT_TIME; + + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -1625,6 +1635,12 @@ static void prepare_read_tag(struct ceph_connection *con) con->in_tag = CEPH_MSGR_TAG_READY; } +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->in_base_pos = 0; +} + /* * Prepare to read a message. */ @@ -2322,13 +2338,6 @@ static int read_partial_message(struct ceph_connection *con) return ret; BUG_ON(!con->in_msg ^ skip); - if (con->in_msg && data_len > con->in_msg->data_length) { - pr_warn("%s skipping long message (%u > %zd)\n", - __func__, data_len, con->in_msg->data_length); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - skip = 1; - } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); @@ -2457,6 +2466,17 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} /* * Write something to the socket. Called in a worker thread when the @@ -2526,6 +2546,10 @@ more_kvec: do_next: if (con->state == CON_STATE_OPEN) { + if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); @@ -2535,10 +2559,6 @@ do_next: prepare_write_ack(con); goto more; } - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } } /* Nothing to do! */ @@ -2641,6 +2661,9 @@ more: case CEPH_MSGR_TAG_ACK: prepare_read_ack(con); break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); con->state = CON_STATE_CLOSED; @@ -2684,6 +2707,12 @@ more: process_ack(con); goto more; } + if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } out: dout("try_read done on %p ret %d\n", con, ret); @@ -2799,7 +2828,7 @@ static void con_fault_finish(struct ceph_connection *con) /* * Do some work on a connection. Drop a connection ref when we're done. */ -static void con_work(struct work_struct *work) +static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); @@ -3101,6 +3130,20 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +bool ceph_con_keepalive_expired(struct ceph_connection *con, + unsigned long interval) +{ + if (interval > 0 && + (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { + struct timespec now = CURRENT_TIME; + struct timespec ts; + jiffies_to_timespec(interval, &ts); + ts = timespec_add(con->last_keepalive_ack, ts); + return timespec_compare(&now, &ts) >= 0; + } + return false; +} + static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) { struct ceph_msg_data *data; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 9d6ff1215928..edda01626a45 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc) CEPH_ENTITY_TYPE_MON, monc->cur_mon, &monc->monmap->mon_inst[monc->cur_mon].addr); + /* send an initial keepalive to ensure our timestamp is + * valid by the time we are in an OPENED state */ + ceph_con_keepalive(&monc->con); + /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, @@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc) */ static void __schedule_delayed(struct ceph_mon_client *monc) { - unsigned int delay; + struct ceph_options *opt = monc->client->options; + unsigned long delay; - if (monc->cur_mon < 0 || __sub_expired(monc)) + if (monc->cur_mon < 0 || __sub_expired(monc)) { delay = 10 * HZ; - else + } else { delay = 20 * HZ; - dout("__schedule_delayed after %u\n", delay); - schedule_delayed_work(&monc->delayed_work, delay); + if (opt->monc_ping_timeout > 0) + delay = min(delay, opt->monc_ping_timeout / 3); + } + dout("__schedule_delayed after %lu\n", delay); + schedule_delayed_work(&monc->delayed_work, + round_jiffies_relative(delay)); } /* @@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work) __close_session(monc); __open_session(monc); /* continue hunting */ } else { - ceph_con_keepalive(&monc->con); + struct ceph_options *opt = monc->client->options; + int is_auth = ceph_auth_is_authenticated(monc->auth); + if (ceph_con_keepalive_expired(&monc->con, + opt->monc_ping_timeout)) { + dout("monc keepalive timeout\n"); + is_auth = 0; + __close_session(monc); + monc->hunting = true; + __open_session(monc); + } - __validate_auth(monc); + if (!monc->hunting) { + ceph_con_keepalive(&monc->con); + __validate_auth(monc); + } - if (ceph_auth_is_authenticated(monc->auth)) + if (is_auth) __send_subscribe(monc); } __schedule_delayed(monc); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 50033677c0fa..f79ccac6699f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -285,6 +285,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, switch (op->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -485,13 +486,14 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, size_t payload_len = 0; BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && - opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); + opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && + opcode != CEPH_OSD_OP_TRUNCATE); op->extent.offset = offset; op->extent.length = length; op->extent.truncate_size = truncate_size; op->extent.truncate_seq = truncate_seq; - if (opcode == CEPH_OSD_OP_WRITE) + if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) payload_len += length; op->payload_len = payload_len; @@ -670,9 +672,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if (src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); @@ -681,7 +685,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE) + if (src->op == CEPH_OSD_OP_WRITE || + src->op == CEPH_OSD_OP_WRITEFULL) ceph_osdc_msg_data_add(req->r_request, osd_data); else ceph_osdc_msg_data_add(req->r_reply, osd_data); @@ -2817,8 +2822,9 @@ out: } /* - * lookup and return message for incoming reply. set up reply message - * pages. + * Lookup and return message for incoming reply. Don't try to do + * anything about a larger than preallocated data portion of the + * message at the moment - for now, just skip the message. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -2836,10 +2842,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - *skip = 1; + pr_warn("%s osd%d tid %llu unknown, skipping\n", + __func__, osd->o_osd, tid); m = NULL; - dout("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); + *skip = 1; goto out; } @@ -2849,10 +2855,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_revoke_incoming(req->r_reply); if (front_len > req->r_reply->front_alloc_len) { - pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n", - front_len, req->r_reply->front_alloc_len, - (unsigned int)con->peer_name.type, - le64_to_cpu(con->peer_name.num)); + pr_warn("%s osd%d tid %llu front %d > preallocated %d\n", + __func__, osd->o_osd, req->r_tid, front_len, + req->r_reply->front_alloc_len); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) @@ -2860,37 +2865,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ceph_msg_put(req->r_reply); req->r_reply = m; } - m = ceph_msg_get(req->r_reply); - - if (data_len > 0) { - struct ceph_osd_data *osd_data; - /* - * XXX This is assuming there is only one op containing - * XXX page data. Probably OK for reads, but this - * XXX ought to be done more generally. - */ - osd_data = osd_req_op_extent_osd_data(req, 0); - if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { - if (osd_data->pages && - unlikely(osd_data->length < data_len)) { - - pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n", - tid, data_len, osd_data->length); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - } + if (data_len > req->r_reply->data_length) { + pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", + __func__, osd->o_osd, req->r_tid, data_len, + req->r_reply->data_length); + m = NULL; + *skip = 1; + goto out; } - *skip = 0; + + m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); out: mutex_unlock(&osdc->request_mutex); return m; - } static struct ceph_msg *alloc_msg(struct ceph_connection *con, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4a3125836b64..7d8f581d9f1f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_addr(&addr); pr_info("osd%d up\n", osd); BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; map->osd_addr[osd] = addr; } diff --git a/net/core/dev.c b/net/core/dev.c index 877c84834d81..6bb6470f5b7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4713,6 +4713,8 @@ void napi_disable(struct napi_struct *n) while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); + while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) + msleep(1); hrtimer_cancel(&n->timer); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b495ab1797fa..29edf74846fc 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1284,7 +1284,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) gstrings.len = ret; - data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); if (!data) return -ENOMEM; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index ae8306e7c56f..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -44,7 +44,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, } EXPORT_SYMBOL(fib_default_rule_add); -u32 fib_default_rule_pref(struct fib_rules_ops *ops) +static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; @@ -60,7 +60,6 @@ u32 fib_default_rule_pref(struct fib_rules_ops *ops) return 0; } -EXPORT_SYMBOL(fib_default_rule_pref); static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, @@ -299,8 +298,8 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) } rule->fr_net = net; - if (tb[FRA_PRIORITY]) - rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) + : fib_default_rule_pref(ops); if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -350,9 +349,6 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) else rule->suppress_ifgroup = -1; - if (!tb[FRA_PRIORITY] && ops->default_pref) - rule->pref = ops->default_pref(ops); - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) @@ -635,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, { int idx = 0; struct fib_rule *rule; + int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; - if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWRULE, - NLM_F_MULTI, ops) < 0) + err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops); + if (err) break; skip: idx++; @@ -652,7 +650,7 @@ skip: cb->args[1] = idx; rules_ops_put(ops); - return skb->len; + return err; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -668,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) if (ops == NULL) return -EAFNOSUPPORT; - return dump_rules(skb, cb, ops); + dump_rules(skb, cb, ops); + + return skb->len; } rcu_read_lock(); diff --git a/net/core/filter.c b/net/core/filter.c index 13079f03902e..bb18c3680001 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -478,9 +478,9 @@ do_pass: bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; - insn->src_reg = BPF_REG_X; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); + insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ @@ -1415,6 +1415,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) return dev_forward_skb(dev, skb2); skb2->dev = dev; + skb_sender_cpu_clear(skb2); return dev_queue_xmit(skb2); } @@ -1854,9 +1855,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; /* We're copying the filter that has been originally attached, - * so no conversion/decode needed anymore. + * so no conversion/decode needed anymore. eBPF programs that + * have no original program cannot be dumped through this. */ + ret = -EACCES; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; ret = fprog->len; if (!len) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b279077c3089..830f8a7c1cb1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -31,7 +31,6 @@ static const char fmt_hex[] = "%#x\n"; static const char fmt_long_hex[] = "%#lx\n"; static const char fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; if (!__ethtool_get_settings(netdev, &cmd)) - ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); return ret; @@ -1481,6 +1480,15 @@ static int of_dev_node_match(struct device *dev, const void *data) return ret == 0 ? dev->of_node == data : ret; } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6aa3db8dfc3b..8bdada242a7d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -142,7 +142,7 @@ static void queue_process(struct work_struct *work) */ static int poll_one_napi(struct napi_struct *napi, int budget) { - int work; + int work = 0; /* net_rx_action's ->poll() invocations and our's are * synchronized by this test which is only made while @@ -151,7 +151,12 @@ static int poll_one_napi(struct napi_struct *napi, int budget) if (!test_bit(NAPI_STATE_SCHED, &napi->state)) return budget; - set_bit(NAPI_STATE_NPSVC, &napi->state); + /* If we set this bit but see that it has already been set, + * that indicates that napi has been disabled and we need + * to abort this operation + */ + if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) + goto out; work = napi->poll(napi, budget); WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); @@ -159,6 +164,7 @@ static int poll_one_napi(struct napi_struct *napi, int budget) clear_bit(NAPI_STATE_NPSVC, &napi->state); +out: return budget - work; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a466821d1441..0ec48403ed68 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3047,6 +3047,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; u32 filter_mask = 0; + int err; if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { struct nlattr *extfilt; @@ -3067,20 +3068,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - br_dev->netdev_ops->ndo_bridge_getlink( - skb, portid, seq, dev, filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = br_dev->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev, + filter_mask, NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } if (ops->ndo_bridge_getlink) { - if (idx >= cb->args[0] && - ops->ndo_bridge_getlink(skb, portid, seq, dev, - filter_mask, - NLM_F_MULTI) < 0) - break; + if (idx >= cb->args[0]) { + err = ops->ndo_bridge_getlink(skb, portid, + seq, dev, + filter_mask, + NLM_F_MULTI); + if (err < 0 && err != -EOPNOTSUPP) + break; + } idx++; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dad4dd37e2aa..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); diff --git a/net/core/sock.c b/net/core/sock.c index ca2984afe16e..3307c02244d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2740,10 +2740,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot) return; kfree(rsk_prot->slab_name); rsk_prot->slab_name = NULL; - if (rsk_prot->slab) { - kmem_cache_destroy(rsk_prot->slab); - rsk_prot->slab = NULL; - } + kmem_cache_destroy(rsk_prot->slab); + rsk_prot->slab = NULL; } static int req_prot_init(const struct proto *prot) @@ -2828,10 +2826,8 @@ void proto_unregister(struct proto *prot) list_del(&prot->node); mutex_unlock(&proto_list_mutex); - if (prot->slab != NULL) { - kmem_cache_destroy(prot->slab); - prot->slab = NULL; - } + kmem_cache_destroy(prot->slab); + prot->slab = NULL; req_prot_cleanup(prot->rsk_prot); diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bd9e718c2a20..3de0d0362d7f 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err: void dccp_ackvec_exit(void) { - if (dccp_ackvec_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - } - if (dccp_ackvec_record_slab != NULL) { - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; - } + kmem_cache_destroy(dccp_ackvec_slab); + dccp_ackvec_slab = NULL; + kmem_cache_destroy(dccp_ackvec_record_slab); + dccp_ackvec_record_slab = NULL; } diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 83498975165f..90f77d08cc37 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f static void ccid_kmem_cache_destroy(struct kmem_cache *slab) { - if (slab != NULL) - kmem_cache_destroy(slab); + kmem_cache_destroy(slab); } static int __init ccid_activate(struct ccid_operations *ccid_ops) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee2dd03..838f524cf11a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = sk->sk_ipv6only; } #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) timeo = DCCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index 9d66a0f72f90..295bbd6a56f2 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -229,7 +229,6 @@ static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .configure = dn_fib_rule_configure, .compare = dn_fib_rule_compare, .fill = dn_fib_rule_fill, - .default_pref = fib_default_rule_pref, .flush_cache = dn_fib_rule_flush_cache, .nlgroup = RTNLGRP_DECnet_RULE, .policy = dn_fib_rule_policy, diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 76e3800765f8..adb5325f4934 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -22,6 +22,7 @@ #include <linux/of_platform.h> #include <linux/of_net.h> #include <linux/sysfs.h> +#include <linux/phy_fixed.h> #include "dsa_priv.h" char dsa_driver_version[] = "0.1"; @@ -305,7 +306,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (ret < 0) goto out; - ds->slave_mii_bus = mdiobus_alloc(); + ds->slave_mii_bus = devm_mdiobus_alloc(parent); if (ds->slave_mii_bus == NULL) { ret = -ENOMEM; goto out; @@ -314,7 +315,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) ret = mdiobus_register(ds->slave_mii_bus); if (ret < 0) - goto out_free; + goto out; /* @@ -367,10 +368,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) return ret; -out_free: - mdiobus_free(ds->slave_mii_bus); out: - kfree(ds); return ret; } @@ -400,7 +398,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Allocate and initialise switch state. */ - ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); + ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL); if (ds == NULL) return ERR_PTR(-ENOMEM); @@ -420,10 +418,47 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, static void dsa_switch_destroy(struct dsa_switch *ds) { + struct device_node *port_dn; + struct phy_device *phydev; + struct dsa_chip_data *cd = ds->pd; + int port; + #ifdef CONFIG_NET_DSA_HWMON if (ds->hwmon_dev) hwmon_device_unregister(ds->hwmon_dev); #endif + + /* Disable configuration of the CPU and DSA ports */ + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + port_dn = cd->port_dn[port]; + if (of_phy_is_fixed_link(port_dn)) { + phydev = of_phy_find_device(port_dn); + if (phydev) { + int addr = phydev->addr; + + phy_device_free(phydev); + of_node_put(port_dn); + fixed_phy_del(addr); + } + } + } + + /* Destroy network devices for physical switch ports. */ + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(ds->phys_port_mask & (1 << port))) + continue; + + if (!ds->ports[port]) + continue; + + unregister_netdev(ds->ports[port]); + free_netdev(ds->ports[port]); + } + + mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP @@ -634,6 +669,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) port_index++; } kfree(pd->chip[i].rtable); + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); } kfree(pd->chip); } @@ -661,16 +700,22 @@ static int dsa_of_probe(struct device *dev) return -EPROBE_DEFER; ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) - return -EINVAL; + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) - return -EPROBE_DEFER; + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto out_put_ethernet; + } dev->platform_data = pd; pd->of_netdev = ethernet_dev; @@ -691,7 +736,9 @@ static int dsa_of_probe(struct device *dev) cd = &pd->chip[chip_index]; cd->of_node = child; - cd->host_dev = &mdio_bus->dev; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); sw_addr = of_get_property(child, "reg", NULL); if (!sw_addr) @@ -711,6 +758,12 @@ static int dsa_of_probe(struct device *dev) ret = -EPROBE_DEFER; goto out_free_chip; } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. + */ + put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } @@ -744,6 +797,10 @@ static int dsa_of_probe(struct device *dev) } } + /* The individual chips hold their own refcount on the mdio bus, + * so drop ours */ + put_device(&mdio_bus->dev); + return 0; out_free_chip: @@ -751,6 +808,10 @@ out_free_chip: out_free: kfree(pd); dev->platform_data = NULL; +out_put_ethernet: + put_device(ðernet_dev->dev); +out_put_mdio: + put_device(&mdio_bus->dev); return ret; } @@ -762,6 +823,7 @@ static void dsa_of_remove(struct device *dev) return; dsa_of_free_platform_data(pd); + put_device(&pd->of_netdev->dev); kfree(pd); } #else @@ -775,10 +837,11 @@ static inline void dsa_of_remove(struct device *dev) } #endif -static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, - struct device *parent, struct dsa_platform_data *pd) +static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, + struct device *parent, struct dsa_platform_data *pd) { int i; + unsigned configured = 0; dst->pd = pd; dst->master_netdev = dev; @@ -798,9 +861,17 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->ds[i] = ds; if (ds->drv->poll_link != NULL) dst->link_poll_needed = 1; + + ++configured; } /* + * If no switch was found, exit cleanly + */ + if (!configured) + return -EPROBE_DEFER; + + /* * If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get * sent to the tag format's receive function. @@ -816,6 +887,8 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, dst->link_poll_timer.expires = round_jiffies(jiffies + HZ); add_timer(&dst->link_poll_timer); } + + return 0; } static int dsa_probe(struct platform_device *pdev) @@ -856,7 +929,7 @@ static int dsa_probe(struct platform_device *pdev) goto out; } - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL); if (dst == NULL) { dev_put(dev); ret = -ENOMEM; @@ -865,7 +938,9 @@ static int dsa_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dst); - dsa_setup_dst(dst, dev, &pdev->dev, pd); + ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); + if (ret) + goto out; return 0; @@ -887,7 +962,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; - if (ds != NULL) + if (ds) dsa_switch_destroy(ds); } } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index cce97385f743..7d91f4612ac0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -458,12 +458,17 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state) static int dsa_slave_port_attr_set(struct net_device *dev, struct switchdev_attr *attr) { - int ret = 0; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret; switch (attr->id) { case SWITCHDEV_ATTR_PORT_STP_STATE: - if (attr->trans == SWITCHDEV_TRANS_COMMIT) - ret = dsa_slave_stp_update(dev, attr->u.stp_state); + if (attr->trans == SWITCHDEV_TRANS_PREPARE) + ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP; + else + ret = ds->drv->port_stp_update(ds, p->port, + attr->u.stp_state); break; default: ret = -EOPNOTSUPP; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d25efc93d8f1..b6ca0890d018 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -78,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, trailer = skb_tail_pointer(skb) - 4; if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || - (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) + (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00) goto out_drop; source_port = trailer[1] & 7; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b75e925..0c9c3482e419 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> +#include <net/dst_metadata.h> +#include <net/ip_tunnels.h> #include <linux/uaccess.h> @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw, struct sk_buff *oskb) + const unsigned char *target_hw, + struct dst_entry *dst) { struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip, if (!skb) return; - if (oskb) - skb_dst_copy(skb, oskb); - + skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL, - dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -649,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -774,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -794,13 +804,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); - return 0; + goto out_free_dst; } goto out; } @@ -854,6 +865,8 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) out: consume_skb(skb); +out_free_dst: + dst_release(reply_dst); return 0; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6fcbd215cdbc..690bcbc59f26 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -340,6 +340,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_flags = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 18123d50f576..f2bda9e89c61 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb6d23c..6c2af797f2f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) continue; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79fe05befcae..e5eb8ac4089d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -461,7 +461,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 134957159c27..61b45a17fc73 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -577,21 +577,22 @@ EXPORT_SYMBOL(inet_rtx_syn_ack); static bool reqsk_queue_unlink(struct request_sock_queue *queue, struct request_sock *req) { - struct listen_sock *lopt = queue->listen_opt; struct request_sock **prev; + struct listen_sock *lopt; bool found = false; spin_lock(&queue->syn_wait_lock); - - for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; - prev = &(*prev)->dl_next) { - if (*prev == req) { - *prev = req->dl_next; - found = true; - break; + lopt = queue->listen_opt; + if (lopt) { + for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; + prev = &(*prev)->dl_next) { + if (*prev == req) { + *prev = req->dl_next; + found = true; + break; + } } } - spin_unlock(&queue->syn_wait_lock); if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) reqsk_put(req); @@ -685,20 +686,20 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue, req->num_timeout = 0; req->sk = NULL; + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); + req->rsk_hash = hash; + /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ smp_wmb(); atomic_set(&req->rsk_refcnt, 2); - setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); - req->rsk_hash = hash; spin_lock(&queue->syn_wait_lock); req->dl_next = lopt->syn_table[hash]; lopt->syn_table[hash] = req; spin_unlock(&queue->syn_wait_lock); - - mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } EXPORT_SYMBOL(reqsk_queue_hash_req); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc24fbe8..c67f9bd7699c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 29ed6c5a5185..84dce6a92f93 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,12 +46,13 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/dst_metadata.h> int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); struct iphdr *iph; int err; @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) @@ -198,8 +226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, - [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; @@ -239,12 +265,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP_TOS]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[LWTUNNEL_IP_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - - if (tb[LWTUNNEL_IP_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[LWTUNNEL_IP_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); @@ -266,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -281,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + nla_total_size(1) /* LWTUNNEL_IP_TTL */ - + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } @@ -305,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, - [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, - [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, }; @@ -346,12 +360,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_TC]) tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); - if (tb[LWTUNNEL_IP6_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - - if (tb[LWTUNNEL_IP6_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); - if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); @@ -373,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || - nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -388,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + nla_total_size(1) /* LWTUNNEL_IP6_TC */ - + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ - + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a2c0162c3ba..866ee89f5254 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 690d27d3f2f9..a35584176535 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -75,6 +75,7 @@ endif # NF_TABLES config NF_DUP_IPV4 tristate "Netfilter IPv4 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK help This option enables the nf_dup_ipv4 core, which duplicates an IPv4 packet to be rerouted to another destination. diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8618fd150c96..c4ffc9de1654 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -61,9 +61,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, if (FIB_RES_DEV(res) == dev) dev_match = true; #endif - if (dev_match || flags & XT_RPFILTER_LOOSE) - return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; - return dev_match; + return dev_match || flags & XT_RPFILTER_LOOSE; } static bool rpfilter_is_local(const struct sk_buff *skb) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5f4a5565ad8b..c81deb85acb4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1737,6 +1737,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res, 0); @@ -2045,6 +2046,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2153,7 +2155,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2181,7 +2184,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 28011fb1f4a2..448c2615fece 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -151,6 +151,27 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } +static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_TX_START) { + struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; + + /* We were application limited (idle) for a while. + * Shift epoch_start to keep cwnd growth to cubic curve. + */ + if (ca->epoch_start && delta > 0) { + ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } + return; + } +} + /* calculate the cubic root of x using a table lookup followed by one * Newton-Raphson iteration. * Avg err ~= 0.195% @@ -450,6 +471,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, + .cwnd_event = bictcp_cwnd_event, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b066ac..def765911ff8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -201,7 +201,7 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1188e4fcf23b..3dbee0d83b15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -164,6 +164,9 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + tp->lsndtime = now; /* If it is a reply for ato after last received @@ -940,9 +943,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. @@ -2897,6 +2897,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); + skb_mstamp_get(&skb->skb_mstamp); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3404,7 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); - NET_INC_STATS_BH(sock_net(sk), mib); + NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..f7d1d5e19e95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (netif_index_is_vrf(net, ipc.oif)) { flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - (flow_flags | FLOWI_FLAG_VRFSRC), + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), faddr, saddr, dport, inet->inet_sport); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..c10a9ee68433 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 99c0f2b843f0..36b85bd05ac8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1943,37 +1943,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_dec(ifp->idev, &addr); } -static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) -{ - if (dev->addr_len != ETH_ALEN) - return -1; - memcpy(eui, dev->dev_addr, 3); - memcpy(eui + 5, dev->dev_addr + 3, 3); - - /* - * The zSeries OSA network cards can be shared among various - * OS instances, but the OSA cards have only one MAC address. - * This leads to duplicate address conflicts in conjunction - * with IPv6 if more than one instance uses the same card. - * - * The driver for these cards can deliver a unique 16-bit - * identifier for each instance sharing the same card. It is - * placed instead of 0xFFFE in the interface identifier. The - * "u" bit of the interface identifier is not inverted in this - * case. Hence the resulting interface identifier has local - * scope according to RFC2373. - */ - if (dev->dev_id) { - eui[3] = (dev->dev_id >> 8) & 0xFF; - eui[4] = dev->dev_id & 0xFF; - } else { - eui[3] = 0xFF; - eui[4] = 0xFE; - eui[0] ^= 2; - } - return 0; -} - static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) { if (dev->addr_len != IEEE802154_ADDR_LEN) @@ -3150,6 +3119,8 @@ static void addrconf_gre_config(struct net_device *dev) } addrconf_addr_gen(idev, true); + if (dev->flags & IFF_POINTOPOINT) + addrconf_add_mroute(dev); } #endif @@ -5158,13 +5129,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); - if (rt && ip6_del_rt(rt)) - dst_free(&rt->dst); + if (rt) + ip6_del_rt(rt); } dst_hold(&ifp->rt->dst); - if (ip6_del_rt(ifp->rt)) - dst_free(&ifp->rt->dst); + ip6_del_rt(ifp->rt); rt_genid_bump_ipv6(net); break; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2367a16eae58..9f777ec59a59 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -258,11 +258,6 @@ nla_put_failure: return -ENOBUFS; } -static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) -{ - return 0x3FFF; -} - static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ @@ -279,7 +274,6 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .configure = fib6_rule_configure, .compare = fib6_rule_compare, .fill = fib6_rule_fill, - .default_pref = fib6_rule_default_pref, .nlmsg_payload = fib6_rule_nlmsg_payload, .nlgroup = RTNLGRP_IPV6_RULE, .policy = fib6_rule_policy, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 418d9823692b..7d2e0023c72d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_rcu_free(struct rt6_info *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; if (pcpu_rt) { - dst_free(&pcpu_rt->dst); + rt6_rcu_free(pcpu_rt); *ppcpu_rt = NULL; } } @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { rt6_free_pcpu(rt); - dst_free(&rt->dst); + rt6_rcu_free(rt); } } @@ -846,7 +851,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -872,7 +877,7 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; @@ -933,6 +938,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); + if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && + !atomic_read(&rt->dst.__refcnt))) + return -EINVAL; + if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1025,6 +1034,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, fib6_start_gc(info->nl_net, rt); if (!(rt->rt6i_flags & RTF_CACHE)) fib6_prune_clones(info->nl_net, pn); + rt->dst.flags &= ~DST_NOCACHE; } out: @@ -1049,7 +1059,8 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); } return err; @@ -1060,7 +1071,8 @@ out: st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (!(rt->dst.flags & DST_NOCACHE)) + dst_free(&rt->dst); return err; #endif } @@ -1410,7 +1422,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); - inet6_rt_notify(RTM_DELROUTE, rt, info); + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4038c694ec03..3c7b9310b33f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); break; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); } break; case ICMPV6_PARAMPROB: @@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -634,20 +634,20 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, } if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(tunnel); + dst = ip6_tnl_dst_get(tunnel); if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(tunnel, ndst); + skb_dst_set(skb, dst); proto = NEXTHDR_GRE; if (encap_limit >= 0) { @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, skb_set_inner_protocol(skb, protocol); ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(tunnel, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { static void ip6gre_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -1245,9 +1243,10 @@ static void ip6gre_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; tunnel = netdev_priv(dev); @@ -1255,16 +1254,37 @@ static int ip6gre_tunnel_init(struct net_device *dev) tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + ret = ip6_tnl_dst_init(tunnel); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + + return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ + struct ip6_tnl *tunnel; + int ret; + + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; + + tunnel = netdev_priv(dev); + memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) dev->header_ops = &ip6gre_header_ops; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } @@ -1460,19 +1480,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[], static int ip6gre_tap_init(struct net_device *dev) { struct ip6_tnl *tunnel; + int ret; - tunnel = netdev_priv(dev); + ret = ip6gre_tunnel_init_common(dev); + if (ret) + return ret; - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea47930740..d03d6da772f3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -376,6 +376,9 @@ int ip6_forward(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto drop; + if (unlikely(skb->sk)) + goto drop; + if (skb_warn_if_lro(skb)) goto drop; @@ -586,20 +589,22 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr); + hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); struct sk_buff *frag2; if (first_len - hlen > mtu || ((first_len - hlen) & 7) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path_clean; /* Partially cloned skb? */ @@ -616,8 +621,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, err = 0; offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); /* BUILD HEADER */ *prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +628,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb, if (!tmp_hdr) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); - return -ENOMEM; + err = -ENOMEM; + goto fail; } + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); __skb_pull(skb, hlen); fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -723,7 +729,6 @@ slow_path: */ *prevhdr = NEXTHDR_FRAGMENT; - hroom = LL_RESERVED_SPACE(rt->dst.dev); troom = rt->dst.dev->needed_tailroom; /* @@ -872,7 +877,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { dst_release(dst); dst = NULL; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420612bc..eabffbb89795 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,36 +126,92 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) * Locking : hash tables are protected by RCU and RTNL */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, + struct dst_entry *dst) { - struct dst_entry *dst = t->dst_cache; + write_seqlock_bh(&idst->lock); + dst_release(rcu_dereference_protected( + idst->dst, + lockdep_is_held(&idst->lock.lock))); + if (dst) { + dst_hold(dst); + idst->cookie = rt6_get_cookie((struct rt6_info *)dst); + } else { + idst->cookie = 0; + } + rcu_assign_pointer(idst->dst, dst); + write_sequnlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ + struct ip6_tnl_dst *idst; + struct dst_entry *dst; + unsigned int seq; + u32 cookie; - if (dst && dst->obsolete && - !dst->ops->check(dst, t->dst_cookie)) { - t->dst_cache = NULL; + idst = raw_cpu_ptr(t->dst_cache); + + rcu_read_lock(); + do { + seq = read_seqbegin(&idst->lock); + dst = rcu_dereference(idst->dst); + cookie = idst->cookie; + } while (read_seqretry(&idst->lock, seq)); + + if (dst && !atomic_inc_not_zero(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + + if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { + ip6_tnl_per_cpu_dst_set(idst, NULL); dst_release(dst); - return NULL; + dst = NULL; } - return dst; } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); void ip6_tnl_dst_reset(struct ip6_tnl *t) { - dst_release(t->dst_cache); - t->dst_cache = NULL; + int i; + + for_each_possible_cpu(i) + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL); } EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) +{ + ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); + +void ip6_tnl_dst_destroy(struct ip6_tnl *t) { - struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt6_get_cookie(rt); - dst_release(t->dst_cache); - t->dst_cache = dst; + if (!t->dst_cache) + return; + + ip6_tnl_dst_reset(t); + free_percpu(t->dst_cache); } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t) +{ + int i; + + t->dst_cache = alloc_percpu(struct ip6_tnl_dst); + if (!t->dst_cache) + return -ENOMEM; + + for_each_possible_cpu(i) + seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init); /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -271,6 +327,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_dev_free(struct net_device *dev) { + struct ip6_tnl *t = netdev_priv(dev); + + ip6_tnl_dst_destroy(t); free_percpu(dev->tstats); free_netdev(dev); } @@ -510,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, struct ipv6_tlv_tnl_enc_lim *tel; __u32 mtu; case ICMPV6_DEST_UNREACH: - net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); + net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", + t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { - net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } break; @@ -529,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { - net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); + net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", + t->parms.name); rel_msg = 1; } } else { - net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); + net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", + t->parms.name); } break; case ICMPV6_PKT_TOOBIG: @@ -1010,23 +1069,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (!fl6->flowi6_mark) - dst = ip6_tnl_dst_check(t); + dst = ip6_tnl_dst_get(t); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { - ndst = ip6_route_output(net, NULL, fl6); + dst = ip6_route_output(net, NULL, fl6); - if (ndst->error) + if (dst->error) goto tx_err_link_failure; - ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(ndst)) { - err = PTR_ERR(ndst); - ndst = NULL; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; goto tx_err_link_failure; } - dst = ndst; + ndst = dst; } tdev = dst->dev; @@ -1072,12 +1131,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, consume_skb(skb); skb = new_skb; } - if (fl6->flowi6_mark) { - skb_dst_set(skb, dst); - ndst = NULL; - } else { - skb_dst_set_noref(skb, dst); - } + + if (!fl6->flowi6_mark && ndst) + ip6_tnl_dst_set(t, ndst); + skb_dst_set(skb, dst); + skb->transport_header = skb->network_header; proto = fl6->flowi6_proto; @@ -1101,14 +1159,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); - if (ndst) - ip6_tnl_dst_store(t, ndst); return 0; tx_err_link_failure: stats->tx_carrier_errors++; dst_link_failure(skb); tx_err_dst_release: - dst_release(ndst); + dst_release(dst); return err; } @@ -1573,12 +1629,21 @@ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + int ret; t->dev = dev; t->net = dev_net(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + + ret = ip6_tnl_dst_init(t); + if (ret) { + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; + } + return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 74ceb73c1c9a..0e004cc42a22 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -217,7 +217,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .policy = ip6mr_rule_policy, @@ -550,7 +549,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == mrt->mfc6_cache_array) + else if (it->cache == &mrt->mfc6_cache_array[it->ct]) read_unlock(&mrt_lock); } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 96833e4b3193..f6a024e141e5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -58,6 +58,7 @@ endif # NF_TABLES config NF_DUP_IPV6 tristate "Netfilter IPv6 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK help This option enables the nf_dup_ipv6 core, which duplicates an IPv6 packet to be rerouted to another destination. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f45cac6f8356..946880ad48ac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -142,6 +142,9 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *loopback_dev = net->loopback_dev; int cpu; + if (dev == loopback_dev) + return; + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt; @@ -151,14 +154,12 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; - if (rt_idev && (rt_idev->dev == dev || !dev) && - rt_idev->dev != loopback_dev) { + if (rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(loopback_dev); in6_dev_put(rt_idev); } - if (rt_dev && (rt_dev == dev || !dev) && - rt_dev != loopback_dev) { + if (rt_dev == dev) { rt->dst.dev = loopback_dev; dev_hold(rt->dst.dev); dev_put(rt_dev); @@ -247,12 +248,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, { } -static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, - unsigned long old) -{ - return NULL; -} - static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .destroy = ip6_dst_destroy, @@ -261,7 +256,7 @@ static struct dst_ops ip6_dst_blackhole_ops = { .default_advmss = ip6_default_advmss, .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, - .cow_metrics = ip6_rt_blackhole_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .neigh_lookup = ip6_neigh_lookup, }; @@ -318,6 +313,15 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif +static void rt6_info_init(struct rt6_info *rt) +{ + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); +} + /* allocate dst with ip6_dst_ops */ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct net_device *dev, @@ -326,13 +330,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) { - struct dst_entry *dst = &rt->dst; + if (rt) + rt6_info_init(rt); - memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - INIT_LIST_HEAD(&rt->rt6i_siblings); - INIT_LIST_HEAD(&rt->rt6i_uncached); - } return rt; } @@ -1068,6 +1068,9 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; + if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) + oif = 0; + redo_rt6_select: rt = rt6_select(fn, oif, strict); if (rt->rt6i_nsiblings) @@ -1190,13 +1193,16 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; + bool any_src; fl6->flowi6_iif = LOOPBACK_IFINDEX; - if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + any_src = ipv6_addr_any(&fl6->saddr); + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || + (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; - if (!ipv6_addr_any(&fl6->saddr)) + if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); @@ -1212,24 +1218,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0); if (rt) { - new = &rt->dst; - - memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); + rt6_info_init(rt); + new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_sk; - if (dst_metrics_read_only(&ort->dst)) - new->_metrics = ort->dst._metrics; - else - dst_copy_metrics(new, &ort->dst); + dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = ort->rt6i_idev; if (rt->rt6i_idev) in6_dev_hold(rt->rt6i_idev); rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; + rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1322,8 +1324,7 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); - if (ip6_del_rt(rt)) - dst_free(&rt->dst); + ip6_del_rt(rt); } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { rt->rt6i_node->fn_sernum = -1; } @@ -1748,7 +1749,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } -int ip6_route_add(struct fib6_config *cfg) +int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret) { int err; struct net *net = cfg->fc_nlinfo.nl_net; @@ -1756,7 +1757,6 @@ int ip6_route_add(struct fib6_config *cfg) struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; - struct mx6_config mxc = { .mx = NULL, }; int addr_type; if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) @@ -1887,9 +1887,11 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: + case RTN_UNREACHABLE: default: rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN - : -ENETUNREACH; + : (cfg->fc_type == RTN_UNREACHABLE) + ? -EHOSTUNREACH : -ENETUNREACH; rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; @@ -1981,6 +1983,32 @@ install_route: cfg->fc_nlinfo.nl_net = dev_net(dev); + *rt_ret = rt; + + return 0; +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + + *rt_ret = NULL; + + return err; +} + +int ip6_route_add(struct fib6_config *cfg) +{ + struct mx6_config mxc = { .mx = NULL, }; + struct rt6_info *rt = NULL; + int err; + + err = ip6_route_info_create(cfg, &rt); + if (err) + goto out; + err = ip6_convert_metrics(&mxc, cfg); if (err) goto out; @@ -1988,14 +2016,12 @@ install_route: err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc); kfree(mxc.mx); + return err; out: - if (dev) - dev_put(dev); - if (idev) - in6_dev_put(idev); if (rt) dst_free(&rt->dst); + return err; } @@ -2005,7 +2031,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) struct fib6_table *table; struct net *net = dev_net(rt->dst.dev); - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.ip6_null_entry || + rt->dst.flags & DST_NOCACHE) { err = -ENOENT; goto out; } @@ -2492,6 +2519,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dst.addr = *addr; rt->rt6i_dst.plen = 128; rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + rt->dst.flags |= DST_NOCACHE; atomic_set(&rt->dst.__refcnt, 1); @@ -2595,7 +2623,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev) fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); - rt6_uncached_list_flush_dev(net, dev); + if (dev) + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -2776,19 +2805,78 @@ errout: return err; } -static int ip6_route_multipath(struct fib6_config *cfg, int add) +struct rt6_nh { + struct rt6_info *rt6_info; + struct fib6_config r_cfg; + struct mx6_config mxc; + struct list_head next; +}; + +static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) +{ + struct rt6_nh *nh; + + list_for_each_entry(nh, rt6_nh_list, next) { + pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n", + &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, + nh->r_cfg.fc_ifindex); + } +} + +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct rt6_info *rt, struct fib6_config *r_cfg) +{ + struct rt6_nh *nh; + struct rt6_info *rtnh; + int err = -EEXIST; + + list_for_each_entry(nh, rt6_nh_list, next) { + /* check if rt6_info already exists */ + rtnh = nh->rt6_info; + + if (rtnh->dst.dev == rt->dst.dev && + rtnh->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&rtnh->rt6i_gateway, + &rt->rt6i_gateway)) + return err; + } + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; + nh->rt6_info = rt; + err = ip6_convert_metrics(&nh->mxc, r_cfg); + if (err) { + kfree(nh); + return err; + } + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->next, rt6_nh_list); + + return 0; +} + +static int ip6_route_multipath_add(struct fib6_config *cfg) { struct fib6_config r_cfg; struct rtnexthop *rtnh; + struct rt6_info *rt; + struct rt6_nh *err_nh; + struct rt6_nh *nh, *nh_safe; int remaining; int attrlen; - int err = 0, last_err = 0; + int err = 1; + int nhn = 0; + int replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); + LIST_HEAD(rt6_nh_list); remaining = cfg->fc_mp_len; -beginning: rtnh = (struct rtnexthop *)cfg->fc_mp; - /* Parse a Multipath Entry */ + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * rt6_info structs per nexthop + */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) @@ -2808,22 +2896,32 @@ beginning: if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } - err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); + + err = ip6_route_info_create(&r_cfg, &rt); + if (err) + goto cleanup; + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { - last_err = err; - /* If we are trying to remove a route, do not stop the - * loop when ip6_route_del() fails (because next hop is - * already gone), we should try to remove all next hops. - */ - if (add) { - /* If add fails, we should try to delete all - * next hops that have been already added. - */ - add = 0; - remaining = cfg->fc_mp_len - remaining; - goto beginning; - } + dst_free(&rt->dst); + goto cleanup; + } + + rtnh = rtnh_next(rtnh, &remaining); + } + + err_nh = NULL; + list_for_each_entry(nh, &rt6_nh_list, next) { + err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc); + /* nh->rt6_info is used or freed at this point, reset to NULL*/ + nh->rt6_info = NULL; + if (err) { + if (replace && nhn) + ip6_print_replace_route_err(&rt6_nh_list); + err_nh = nh; + goto add_errout; } + /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: @@ -2833,6 +2931,62 @@ beginning: */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + nhn++; + } + + goto cleanup; + +add_errout: + /* Delete routes that were already added */ + list_for_each_entry(nh, &rt6_nh_list, next) { + if (err_nh == nh) + break; + ip6_route_del(&nh->r_cfg); + } + +cleanup: + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + if (nh->rt6_info) + dst_free(&nh->rt6_info->dst); + kfree(nh->mxc.mx); + list_del(&nh->next); + kfree(nh); + } + + return err; +} + +static int ip6_route_multipath_del(struct fib6_config *cfg) +{ + struct fib6_config r_cfg; + struct rtnexthop *rtnh; + int remaining; + int attrlen; + int err = 1, last_err = 0; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + nla_memcpy(&r_cfg.fc_gateway, nla, 16); + r_cfg.fc_flags |= RTF_GATEWAY; + } + } + err = ip6_route_del(&r_cfg); + if (err) + last_err = err; + rtnh = rtnh_next(rtnh, &remaining); } @@ -2849,7 +3003,7 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 0); + return ip6_route_multipath_del(&cfg); else return ip6_route_del(&cfg); } @@ -2864,7 +3018,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; if (cfg.fc_mp) - return ip6_route_multipath(&cfg, 1); + return ip6_route_multipath_add(&cfg); else return ip6_route_add(&cfg); } @@ -3155,7 +3309,8 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int nlm_flags) { struct sk_buff *skb; struct net *net = info->nl_net; @@ -3170,7 +3325,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, 0, 0, 0); + event, info->portid, seq, 0, 0, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f10b9400b6d7..da55e0c85bb8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -37,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; + fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index a26c401ef4a4..43964594aa12 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -1839,7 +1839,7 @@ static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off) for (element = hashbin_get_first(iter->hashbin); element != NULL; element = hashbin_get_next(iter->hashbin)) { - if (!off || *off-- == 0) { + if (!off || (*off)-- == 0) { /* NB: hashbin left locked */ return element; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f6b090df3930..afca2eb4dfa7 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1319,7 +1319,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work) tunnel = container_of(work, struct l2tp_tunnel, del_work); sk = l2tp_tunnel_sock_lookup(tunnel); if (!sk) - return; + goto out; sock = sk->sk_socket; @@ -1341,6 +1341,8 @@ static void l2tp_tunnel_del_work(struct work_struct *work) } l2tp_tunnel_sock_put(sk); +out: + l2tp_tunnel_dec_refcount(tunnel); } /* Create a socket for the tunnel, if one isn't set up by @@ -1636,8 +1638,13 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); */ int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { + l2tp_tunnel_inc_refcount(tunnel); l2tp_tunnel_closeall(tunnel); - return (false == queue_work(l2tp_wq, &tunnel->del_work)); + if (false == queue_work(l2tp_wq, &tunnel->del_work)) { + l2tp_tunnel_dec_refcount(tunnel); + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 685ec13ed7c2..7a77a1470f25 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2468,8 +2468,13 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, rssi_hyst == bss_conf->cqm_rssi_hyst) return 0; + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER && + !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) + return -EOPNOTSUPP; + bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ if (sdata->u.mgd.associated && @@ -2514,15 +2519,17 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, continue; for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { - if (~sdata->rc_rateidx_mcs_mask[i][j]) + if (~sdata->rc_rateidx_mcs_mask[i][j]) { sdata->rc_has_mcs_mask[i] = true; + break; + } + } - if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) + for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) { sdata->rc_has_vht_mcs_mask[i] = true; - - if (sdata->rc_has_mcs_mask[i] && - sdata->rc_has_vht_mcs_mask[i]) break; + } } } diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index ced6bf3be8d6..1560c8482bcb 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -149,7 +149,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) - pos += scnprintf(pos, end - pos, "%s", + pos += scnprintf(pos, end - pos, "%s\n", hw_flag_names[i]); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 705ef1d040ed..cd7e55e08a23 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4267,6 +4267,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; + u32 i; + bool have_80mhz; sband = local->hw.wiphy->bands[cbss->channel->band]; @@ -4317,6 +4319,20 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, ht_cap, ht_oper, vht_oper, diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 9857693b91ec..9ce8883d5f44 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -716,7 +716,7 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; - for (i = 0; i < sizeof(mcs_mask); i++) + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 8ba583243509..3ed7ddfbf8e8 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -101,6 +101,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); + ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index aee701a5649e..4e202d0679b2 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1249,6 +1249,58 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->chanctx_mtx); } +static int iee80211_tdls_have_ht_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + bool result = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH) || + !sta->sta.ht_cap.ht_supported) + continue; + result = true; + break; + } + rcu_read_unlock(); + + return result; +} + +static void +iee80211_tdls_recalc_ht_protection(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + bool tdls_ht; + u16 protection = IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | + IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT | + IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; + u16 opmode; + + /* Nothing to do if the BSS connection uses HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + return; + + tdls_ht = (sta && sta->sta.ht_cap.ht_supported) || + iee80211_tdls_have_ht_peers(sdata); + + opmode = sdata->vif.bss_conf.ht_operation_mode; + + if (tdls_ht) + opmode |= protection; + else + opmode &= ~protection; + + if (opmode == sdata->vif.bss_conf.ht_operation_mode) + return; + + sdata->vif.bss_conf.ht_operation_mode = opmode; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1274,6 +1326,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, return -ENOTSUPP; } + /* protect possible bss_conf changes and avoid concurrency in + * ieee80211_bss_info_change_notify() + */ + sdata_lock(sdata); mutex_lock(&local->mtx); tdls_dbg(sdata, "TDLS oper %d peer %pM\n", oper, peer); @@ -1287,16 +1343,18 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, iee80211_tdls_recalc_chanctx(sdata); - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); if (!sta) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); ret = -ENOLINK; break; } + iee80211_tdls_recalc_ht_protection(sdata, sta); + set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH); - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); WARN_ON_ONCE(is_zero_ether_addr(sdata->u.mgd.tdls_peer) || !ether_addr_equal(sdata->u.mgd.tdls_peer, peer)); @@ -1318,6 +1376,11 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + + mutex_lock(&local->sta_mtx); + iee80211_tdls_recalc_ht_protection(sdata, NULL); + mutex_unlock(&local->sta_mtx); + iee80211_tdls_recalc_chanctx(sdata); break; default: @@ -1335,6 +1398,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, &sdata->u.mgd.request_smps_work); mutex_unlock(&local->mtx); + sdata_unlock(sdata); return ret; } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 84e0e8c7fb23..7892eb8ed4c8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1218,8 +1218,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, if (!tx->sta) info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; - else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) + else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) { info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; + ieee80211_check_fast_xmit(tx->sta); + } info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; @@ -2451,7 +2453,8 @@ void ieee80211_check_fast_xmit(struct sta_info *sta) if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || - test_sta_flag(sta, WLAN_STA_PS_DELIVER)) + test_sta_flag(sta, WLAN_STA_PS_DELIVER) || + test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT)) goto out; if (sdata->noack_map) diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 834ccdbc74be..ff1c798921a6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -120,6 +120,7 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; + bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -129,6 +130,20 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; + /* Allow VHT if at least one channel on the sband supports 80 MHz */ + have_80mhz = false; + for (i = 0; i < sband->n_channels; i++) { + if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_NO_80MHZ)) + continue; + + have_80mhz = true; + break; + } + + if (!have_80mhz) + return; + /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 8e47f8113495..21a085686dc1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -152,6 +152,8 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) #endif synchronize_net(); nf_queue_nf_hook_drop(net, &entry->ops); + /* other cpu might still process nfqueue verdict that used reg */ + synchronize_net(); kfree(entry); } EXPORT_SYMBOL(nf_unregister_net_hook); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index afe905c208af..691b54fcaf2a 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -152,9 +152,13 @@ htable_bits(u32 hashsize) #define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128) #ifdef IP_SET_HASH_WITH_NET0 +/* cidr from 0 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) (SET_HOST_MASK(family) + 1) +#define CIDR_POS(c) ((c) - 1) #else +/* cidr from 1 to SET_HOST_MASK() value and c = cidr + 1 */ #define NLEN(family) SET_HOST_MASK(family) +#define CIDR_POS(c) ((c) - 2) #endif #else @@ -305,7 +309,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) } else if (h->nets[i].cidr[n] < cidr) { j = i; } else if (h->nets[i].cidr[n] == cidr) { - h->nets[cidr - 1].nets[n]++; + h->nets[CIDR_POS(cidr)].nets[n]++; return; } } @@ -314,7 +318,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) h->nets[i].cidr[n] = h->nets[i - 1].cidr[n]; } h->nets[i].cidr[n] = cidr; - h->nets[cidr - 1].nets[n] = 1; + h->nets[CIDR_POS(cidr)].nets[n] = 1; } static void @@ -325,8 +329,8 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n) for (i = 0; i < nets_length; i++) { if (h->nets[i].cidr[n] != cidr) continue; - h->nets[cidr - 1].nets[n]--; - if (h->nets[cidr - 1].nets[n] > 0) + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) return; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3c862c0a76d1..a93dfebffa81 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -131,6 +131,13 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netnet4_init(struct hash_netnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -160,7 +167,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, last; u32 ip2 = 0, ip2_from = 0, ip2_to = 0, last2; @@ -169,6 +176,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; @@ -357,6 +365,13 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netnet6_init(struct hash_netnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -385,13 +400,14 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0c68734f5cc4..9a14c237830f 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -142,6 +142,13 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next, #define HOST_MASK 32 #include "ip_set_hash_gen.h" +static void +hash_netportnet4_init(struct hash_netportnet4_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -175,7 +182,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet4_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, ip_last, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2_last, ip2; @@ -185,6 +192,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || @@ -412,6 +420,13 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next, #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" +static void +hash_netportnet6_init(struct hash_netportnet6_elem *e) +{ + e->cidr[0] = HOST_MASK; + e->cidr[1] = HOST_MASK; +} + static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -445,7 +460,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], { const struct hash_netportnet *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; - struct hash_netportnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, }; + struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; @@ -454,6 +469,7 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index a1fe5377a2b3..5a30ce6e8c90 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -297,7 +297,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, ip_set_timeout_expired(ext_timeout(n, set)))) n = NULL; - e = kzalloc(set->dsize, GFP_KERNEL); + e = kzalloc(set->dsize, GFP_ATOMIC); if (!e) return -ENOMEM; e->id = d->id; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index eedf0495f11f..c09d6c7198f6 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -313,12 +313,13 @@ out_free: } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); -static void nf_ct_tmpl_free(struct nf_conn *tmpl) +void nf_ct_tmpl_free(struct nf_conn *tmpl) { nf_ct_ext_destroy(tmpl); nf_ct_ext_free(tmpl); kfree(tmpl); } +EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); static void destroy_conntrack(struct nf_conntrack *nfct) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 675d12c69e32..a5d41dfa9f05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister(struct nf_logger *logger) { + const struct nf_logger *log; int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NFPROTO_NUMPROTO; i++) - RCU_INIT_POINTER(loggers[i][logger->type], NULL); + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + log = nft_log_dereference(loggers[i][logger->type]); + if (log == logger) + RCU_INIT_POINTER(loggers[i][logger->type], NULL); + } mutex_unlock(&nf_log_mutex); + synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unregister); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 888b9558415e..c8a4a48bced9 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -380,7 +380,7 @@ static int __net_init synproxy_net_init(struct net *net) err3: free_percpu(snet->stats); err2: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err1: return err; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 0c0e8ecf02ab..70277b11f742 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -444,6 +444,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); + u_int16_t res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || @@ -468,7 +469,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); - nfnetlink_rcv_batch(skb, nlh, nfgenmsg->res_id); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + nfnetlink_rcv_batch(skb, nlh, res_id); } else { netlink_rcv_skb(skb, &nfnetlink_rcv_msg); } diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 685cc6a17163..a5cd6d90b78b 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, __be32 **packet_id_ptr) { size_t size; - size_t data_len = 0, cap_len = 0; + size_t data_len = 0, cap_len = 0, rem_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; @@ -360,6 +360,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; + rem_len = data_len - hlen; break; } @@ -377,7 +378,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, size += nla_total_size(seclen); } - skb = nfnetlink_alloc_skb(net, size, queue->peer_portid, + skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 66def315eb56..9c8fab00164b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -619,6 +619,13 @@ struct nft_xt { static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, + const char *name, u32 rev, u32 family) +{ + return strcmp(match->name, name) == 0 && match->revision == rev && + (match->family == NFPROTO_UNSPEC || match->family == family); +} + static const struct nft_expr_ops * nft_match_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -626,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_match; struct xt_match *match; char *mt_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_MATCH_NAME] == NULL || tb[NFTA_MATCH_REV] == NULL || @@ -641,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_match, &nft_match_list, head) { struct xt_match *match = nft_match->ops.data; - if (strcmp(match->name, mt_name) == 0 && - match->revision == rev && match->family == family) { + if (nft_match_cmp(match, mt_name, rev, family)) { if (!try_module_get(match->me)) return ERR_PTR(-ENOENT); @@ -693,6 +699,13 @@ static LIST_HEAD(nft_target_list); static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, + const char *name, u32 rev, u32 family) +{ + return strcmp(tg->name, name) == 0 && tg->revision == rev && + (tg->family == NFPROTO_UNSPEC || tg->family == family); +} + static const struct nft_expr_ops * nft_target_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -700,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, struct nft_xt *nft_target; struct xt_target *target; char *tg_name; - __u32 rev, family; + u32 rev, family; if (tb[NFTA_TARGET_NAME] == NULL || tb[NFTA_TARGET_REV] == NULL || @@ -715,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx, list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; - if (strcmp(target->name, tg_name) == 0 && - target->revision == rev && target->family == family) { + if (nft_target_cmp(target, tg_name, rev, family)) { if (!try_module_get(target->me)) return ERR_PTR(-ENOENT); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 8e524898ccea..faf32d888198 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -255,7 +255,7 @@ out: return 0; err3: - nf_conntrack_free(ct); + nf_ct_tmpl_free(ct); err2: nf_ct_l3proto_module_put(par->family); err1: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 50889be1517d..fafe33bdb619 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -125,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group) return group ? 1 << (group - 1) : 0; } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, + gfp_t gfp_mask) +{ + unsigned int len = skb_end_offset(skb); + struct sk_buff *new; + + new = alloc_skb(len, gfp_mask); + if (new == NULL) + return NULL; + + NETLINK_CB(new).portid = NETLINK_CB(skb).portid; + NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; + NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + + memcpy(skb_put(new, len), skb->data, len); + return new; +} + int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -206,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb, int ret = -ENOMEM; dev_hold(dev); - nskb = skb_clone(skb, GFP_ATOMIC); + + if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) + nskb = netlink_to_full_skb(skb, GFP_ATOMIC); + else + nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); @@ -279,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk) } #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} - static bool netlink_rx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -674,12 +691,19 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, mask = datagram_poll(file, sock, wait); - spin_lock_bh(&sk->sk_receive_queue.lock); - if (nlk->rx_ring.pg_vec) { - if (netlink_has_valid_frame(&nlk->rx_ring)) - mask |= POLLIN | POLLRDNORM; + /* We could already have received frames in the normal receive + * queue, that will show up as NL_MMAP_STATUS_COPY in the ring, + * so if mask contains pollin/etc already, there's no point + * walking the ring. + */ + if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + if (netlink_has_valid_frame(&nlk->rx_ring)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); } - spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (nlk->tx_ring.pg_vec) { @@ -839,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb) false #define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap @@ -1087,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid) lock_sock(sk); - err = -EBUSY; - if (nlk_sk(sk)->portid) + err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; + if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; @@ -1108,10 +1131,14 @@ static int netlink_insert(struct sock *sk, u32 portid) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; - nlk_sk(sk)->portid = 0; sock_put(sk); + goto err; } + /* We need to ensure that the socket is hashed and visible. */ + smp_wmb(); + nlk_sk(sk)->bound = portid; + err: release_sock(sk); return err; @@ -1496,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err; long unsigned int groups = nladdr->nl_groups; + bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; @@ -1512,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return err; } - if (nlk->portid) + bound = nlk->bound; + if (bound) { + /* Ensure nlk->portid is up-to-date. */ + smp_rmb(); + if (nladdr->nl_pid != nlk->portid) return -EINVAL; + } if (nlk->netlink_bind && groups) { int group; @@ -1530,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, } } - if (!nlk->portid) { + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); @@ -1578,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; - if (!nlk->portid) + /* No need for barriers here as we return to user-space without + * using any of the bound attributes. + */ + if (!nlk->bound) err = netlink_autobind(sock); if (err == 0) { @@ -1837,15 +1876,16 @@ retry: } EXPORT_SYMBOL(netlink_unicast); -struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, - u32 dst_portid, gfp_t gfp_mask) +struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, + unsigned int ldiff, u32 dst_portid, + gfp_t gfp_mask) { #ifdef CONFIG_NETLINK_MMAP + unsigned int maxlen, linear_size; struct sock *sk = NULL; struct sk_buff *skb; struct netlink_ring *ring; struct nl_mmap_hdr *hdr; - unsigned int maxlen; sk = netlink_getsockbyportid(ssk, dst_portid); if (IS_ERR(sk)) @@ -1856,7 +1896,11 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, if (ring->pg_vec == NULL) goto out_put; - if (ring->frame_size - NL_MMAP_HDRLEN < size) + /* We need to account the full linear size needed as a ring + * slot cannot have non-linear parts. + */ + linear_size = size + ldiff; + if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) goto out_put; skb = alloc_skb_head(gfp_mask); @@ -1870,13 +1914,14 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, /* check again under lock */ maxlen = ring->frame_size - NL_MMAP_HDRLEN; - if (maxlen < size) + if (maxlen < linear_size) goto out_free; netlink_forward_ring(ring); hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); if (hdr == NULL) goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); atomic_inc(&ring->pending); @@ -1902,7 +1947,7 @@ out: #endif return alloc_skb(size, gfp_mask); } -EXPORT_SYMBOL_GPL(netlink_alloc_skb); +EXPORT_SYMBOL_GPL(__netlink_alloc_skb); int netlink_has_listeners(struct sock *sk, unsigned int group) { @@ -2326,7 +2371,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, int pos, idx, shift; err = 0; - netlink_table_grab(); + netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; @@ -2341,7 +2386,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, } if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) err = -EFAULT; - netlink_table_ungrab(); + netlink_unlock_table(); break; } case NETLINK_CAP_ACK: @@ -2413,10 +2458,13 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->portid) { + if (!nlk->bound) { err = netlink_autobind(sock); if (err) goto out; + } else { + /* Ensure nlk is hashed and visible. */ + smp_rmb(); } /* It's a really convoluted way for userland to ask for mmaped @@ -2737,6 +2785,7 @@ static int netlink_dump(struct sock *sk) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int len, err = -ENOBUFS; + int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); @@ -2745,9 +2794,6 @@ static int netlink_dump(struct sock *sk) goto errout_skb; } - cb = &nlk->cb; - alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - if (!netlink_rx_is_mmaped(sk) && atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; @@ -2757,23 +2803,35 @@ static int netlink_dump(struct sock *sk) * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ - if (alloc_size < nlk->max_recvmsg_len) { - skb = netlink_alloc_skb(sk, - nlk->max_recvmsg_len, - nlk->portid, + cb = &nlk->cb; + alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); + + if (alloc_min_size < nlk->max_recvmsg_len) { + alloc_size = nlk->max_recvmsg_len; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - /* available room should be exact amount to avoid MSG_TRUNC */ - if (skb) - skb_reserve(skb, skb_tailroom(skb) - - nlk->max_recvmsg_len); } - if (!skb) + if (!skb) { + alloc_size = alloc_min_size; skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); + } if (!skb) goto errout_skb; + + /* Trim skb to allocated size. User is expected to provide buffer as + * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at + * netlink_recvmsg())). dump will pack as many smaller messages as + * could fit within the allocated skb. skb is typically allocated + * with larger space than required (could be as much as near 2x the + * requested size with align to next power of 2 approach). Allowing + * dump to use the excess space makes it difficult for a user to have a + * reasonable static buffer based on the expected largest dump of a + * single netdev. The outcome is MSG_TRUNC error. + */ + skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 89008405d6b4..14437d9b1965 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -35,6 +35,7 @@ struct netlink_sock { unsigned long state; size_t max_recvmsg_len; wait_queue_head_t wait; + bool bound; bool cb_running; struct netlink_callback cb; struct mutex *cb_mutex; @@ -59,6 +60,15 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk) return container_of(sk, struct netlink_sock, sk); } +static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +#else + return false; +#endif /* CONFIG_NETLINK_MMAP */ +} + struct netlink_table { struct rhashtable hash; struct hlist_head mc_list; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index af7cdef42066..d143aa9f6654 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,6 +5,8 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -31,17 +33,6 @@ config OPENVSWITCH If unsure, say N. -config OPENVSWITCH_CONNTRACK - bool "Open vSwitch conntrack action support" - depends on OPENVSWITCH - depends on NF_CONNTRACK - default OPENVSWITCH - ---help--- - If you say Y here, then Open vSwitch module will be able to pass - packets through conntrack. - - Say N to exclude this support and reduce the binary size. - config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 5b5913b06f54..60f809085b92 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,7 +15,9 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o -openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 315f5330b6e5..c6a39bf2c3b9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -684,7 +684,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, { if (skb_network_offset(skb) > MAX_L2_LEN) { OVS_NLERR(1, "L2 header too long to fragment"); - return; + goto err; } if (ethertype == htons(ETH_P_IP)) { @@ -708,8 +708,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, struct rt6_info ovs_rt; if (!v6ops) { - kfree_skb(skb); - return; + goto err; } prepare_frag(vport, skb); @@ -728,8 +727,12 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", ovs_vport_name(vport), ntohs(ethertype), mru, vport->dev->mtu); - kfree_skb(skb); + goto err; } + + return; +err: + kfree_skb(skb); } static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, @@ -968,7 +971,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_STATE: case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: - case OVS_KEY_ATTR_CT_LABEL: + case OVS_KEY_ATTR_CT_LABELS: err = -EINVAL; break; } @@ -1099,6 +1102,12 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_CT: + if (!is_flow_key_valid(key)) { + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, nla_data(a)); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e8e524ad8a01..a5ec34f8502f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -37,9 +37,9 @@ struct md_mark { }; /* Metadata label for masked write to conntrack label. */ -struct md_label { - struct ovs_key_ct_label value; - struct ovs_key_ct_label mask; +struct md_labels { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; }; /* Conntrack action context for execution. */ @@ -47,10 +47,10 @@ struct ovs_conntrack_info { struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; - u32 flags; + u8 commit : 1; u16 family; struct md_mark mark; - struct md_label label; + struct md_labels labels; }; static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -109,21 +109,21 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct) #endif } -static void ovs_ct_get_label(const struct nf_conn *ct, - struct ovs_key_ct_label *label) +static void ovs_ct_get_labels(const struct nf_conn *ct, + struct ovs_key_ct_labels *labels) { struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; if (cl) { size_t len = cl->words * sizeof(long); - if (len > OVS_CT_LABEL_LEN) - len = OVS_CT_LABEL_LEN; - else if (len < OVS_CT_LABEL_LEN) - memset(label, 0, OVS_CT_LABEL_LEN); - memcpy(label, cl->bits, len); + if (len > OVS_CT_LABELS_LEN) + len = OVS_CT_LABELS_LEN; + else if (len < OVS_CT_LABELS_LEN) + memset(labels, 0, OVS_CT_LABELS_LEN); + memcpy(labels, cl->bits, len); } else { - memset(label, 0, OVS_CT_LABEL_LEN); + memset(labels, 0, OVS_CT_LABELS_LEN); } } @@ -134,7 +134,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, key->ct.state = state; key->ct.zone = zone->id; key->ct.mark = ovs_ct_get_mark(ct); - ovs_ct_get_label(ct, &key->ct.label); + ovs_ct_get_labels(ct, &key->ct.labels); } /* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has @@ -151,6 +151,8 @@ static void ovs_ct_update_key(const struct sk_buff *skb, ct = nf_ct_get(skb, &ctinfo); if (ct) { state = ovs_ct_get_state(ctinfo); + if (!nf_ct_is_confirmed(ct)) + state |= OVS_CS_F_NEW; if (ct->master) state |= OVS_CS_F_RELATED; zone = nf_ct_zone(ct); @@ -167,7 +169,7 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) { - if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && @@ -179,8 +181,8 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), - &key->ct.label)) + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), + &key->ct.labels)) return -EMSGSIZE; return 0; @@ -213,18 +215,15 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, #endif } -static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_key_ct_label *label, - const struct ovs_key_ct_label *mask) +static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) { enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; int err; - if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) - return -ENOTSUPP; - /* The connection could be invalid, in which case set_label is no-op.*/ ct = nf_ct_get(skb, &ctinfo); if (!ct) @@ -235,15 +234,15 @@ static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, nf_ct_labels_ext_add(ct); cl = nf_ct_labels_find(ct); } - if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN) + if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) return -ENOSPC; - err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask, - OVS_CT_LABEL_LEN / sizeof(u32)); + err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, + OVS_CT_LABELS_LEN / sizeof(u32)); if (err) return err; - ovs_ct_get_label(ct, &key->ct.label); + ovs_ct_get_labels(ct, &key->ct.labels); return 0; } @@ -275,13 +274,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto) case NFPROTO_IPV6: { u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; + int ofs; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { pr_debug("proto header not found\n"); return NF_ACCEPT; } + protoff = ofs; break; } default: @@ -375,7 +376,7 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return true; } -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { @@ -406,6 +407,8 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, } } + ovs_ct_update_key(skb, key, true); + return 0; } @@ -428,8 +431,6 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, err = __ovs_ct_lookup(net, key, info, skb); if (err) return err; - - ovs_ct_update_key(skb, key, true); } return 0; @@ -458,17 +459,15 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, if (nf_conntrack_confirm(skb) != NF_ACCEPT) return -EINVAL; - ovs_ct_update_key(skb, key, true); - return 0; } -static bool label_nonzero(const struct ovs_key_ct_label *label) +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) { size_t i; - for (i = 0; i < sizeof(*label); i++) - if (label->ct_label[i]) + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) return true; return false; @@ -491,7 +490,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } - if (info->flags & OVS_CT_F_COMMIT) + if (info->commit) err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); @@ -504,9 +503,9 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, if (err) goto err; } - if (label_nonzero(&info->label.mask)) - err = ovs_ct_set_label(skb, key, &info->label.value, - &info->label.mask); + if (labels_nonzero(&info->labels.mask)) + err = ovs_ct_set_labels(skb, key, &info->labels.value, + &info->labels.mask); err: skb_push(skb, nh_ofs); return err; @@ -537,14 +536,13 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, } static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { - [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), - .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), .maxlen = sizeof(struct md_mark) }, - [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), - .maxlen = sizeof(struct md_label) }, + [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), + .maxlen = sizeof(struct md_labels) }, [OVS_CT_ATTR_HELPER] = { .minlen = 1, .maxlen = NF_CT_HELPER_NAME_LEN } }; @@ -574,8 +572,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, } switch (type) { - case OVS_CT_ATTR_FLAGS: - info->flags = nla_get_u32(a); + case OVS_CT_ATTR_COMMIT: + info->commit = true; break; #ifdef CONFIG_NF_CONNTRACK_ZONES case OVS_CT_ATTR_ZONE: @@ -586,15 +584,23 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, case OVS_CT_ATTR_MARK: { struct md_mark *mark = nla_data(a); + if (!mark->mask) { + OVS_NLERR(log, "ct_mark mask cannot be 0"); + return -EINVAL; + } info->mark = *mark; break; } #endif #ifdef CONFIG_NF_CONNTRACK_LABELS - case OVS_CT_ATTR_LABEL: { - struct md_label *label = nla_data(a); + case OVS_CT_ATTR_LABELS: { + struct md_labels *labels = nla_data(a); - info->label = *label; + if (!labels_nonzero(&labels->mask)) { + OVS_NLERR(log, "ct_labels mask cannot be 0"); + return -EINVAL; + } + info->labels = *labels; break; } #endif @@ -631,7 +637,7 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) attr == OVS_KEY_ATTR_CT_MARK) return true; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - attr == OVS_KEY_ATTR_CT_LABEL) { + attr == OVS_KEY_ATTR_CT_LABELS) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); return ovs_net->xt_label; @@ -699,18 +705,19 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (!start) return -EMSGSIZE; - if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags)) + if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) return -EMSGSIZE; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), &ct_info->mark)) return -EMSGSIZE; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), - &ct_info->label)) + labels_nonzero(&ct_info->labels.mask) && + nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), + &ct_info->labels)) return -EMSGSIZE; if (ct_info->helper) { if (nla_put_string(skb, OVS_CT_ATTR_HELPER, @@ -735,7 +742,7 @@ void ovs_ct_free_action(const struct nlattr *a) void ovs_ct_init(struct net *net) { - unsigned int n_bits = sizeof(struct ovs_key_ct_label) * BITS_PER_BYTE; + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); if (nf_connlabels_get(net, n_bits)) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 3cb30667a7dc..82e0dfc66028 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -19,7 +19,7 @@ struct ovs_conntrack_info; enum ovs_key_attr; -#if defined(CONFIG_OPENVSWITCH_CONNTRACK) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) void ovs_ct_init(struct net *); void ovs_ct_exit(struct net *); bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); @@ -34,6 +34,10 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); void ovs_ct_free_action(const struct nlattr *a); + +#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ + OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ + OVS_CS_F_INVALID | OVS_CS_F_TRACKED) #else #include <linux/errno.h> @@ -72,7 +76,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, key->ct.state = 0; key->ct.zone = 0; key->ct.mark = 0; - memset(&key->ct.label, 0, sizeof(key->ct.label)); + memset(&key->ct.labels, 0, sizeof(key->ct.labels)); } static inline int ovs_ct_put_key(const struct sw_flow_key *key, @@ -82,5 +86,7 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key, } static inline void ovs_ct_free_action(const struct nlattr *a) { } -#endif + +#define CT_SUPPORTED_MASK 0 +#endif /* CONFIG_NF_CONNTRACK */ #endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6fbd2decb19e..b816ff871528 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -952,7 +952,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, &mask); + ovs_flow_mask_key(&new_flow->key, &key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], @@ -1080,7 +1080,7 @@ static struct sw_flow_actions *get_flow_actions(struct net *net, struct sw_flow_key masked_key; int error; - ovs_flow_mask_key(&masked_key, key, mask); + ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe527d2dd4b7..8cfa15a08668 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -116,7 +116,7 @@ struct sw_flow_key { u16 zone; u32 mark; u8 state; - struct ovs_key_ct_label label; + struct ovs_key_ct_labels labels; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c92d6a262bc5..bd710bc37469 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -57,6 +57,7 @@ struct ovs_len_tbl { }; #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) @@ -290,10 +291,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ - + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ - + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABEL */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -304,6 +305,10 @@ size_t ovs_key_attr_size(void) + nla_total_size(28); /* OVS_KEY_ATTR_ND */ } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, @@ -315,8 +320,9 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_NESTED }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -343,12 +349,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, - [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, + [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + static bool is_all_zero(const u8 *fp, size_t size) { int i; @@ -388,7 +401,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } expected_len = ovs_key_lens[type].len; - if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(nla), expected_len)) { OVS_NLERR(log, "Key %d has unexpected len %d expected %d", type, nla_len(nla), expected_len); return -EINVAL; @@ -473,29 +486,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a, return 0; } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) { - struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; + struct nlattr *a; + int rem; unsigned long opt_key_offset; struct vxlan_metadata opts; - int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); - if (err < 0) - return err; - memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (tb[OVS_VXLAN_EXT_GBP]) - opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } if (!is_mask) SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -528,8 +562,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type].len != nla_len(a) && - ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type].len); return -EINVAL; @@ -780,7 +814,13 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { - u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); + u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + + if (ct_state & ~CT_SUPPORTED_MASK) { + OVS_NLERR(log, "ct_state flags %08x unsupported", + ct_state); + return -EINVAL; + } SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); @@ -799,14 +839,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); } - if (*attrs & (1 << OVS_KEY_ATTR_CT_LABEL) && - ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABEL)) { - const struct ovs_key_ct_label *cl; + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { + const struct ovs_key_ct_labels *cl; - cl = nla_data(a[OVS_KEY_ATTR_CT_LABEL]); - SW_FLOW_KEY_MEMCPY(match, ct.label, cl->ct_label, + cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); + SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, sizeof(*cl), is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABEL); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); } return 0; } @@ -1052,10 +1092,16 @@ static void nlattr_set(struct nlattr *attr, u8 val, /* The nlattr stream should already have been validated */ nla_for_each_nested(nla, attr, rem) { - if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next); - else + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { + if (tbl[nla_type(nla)].next) + tbl = tbl[nla_type(nla)].next; + nlattr_set(nla, val, tbl); + } else { memset(nla_data(nla), val, nla_len(nla)); + } + + if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) + *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; } } @@ -1922,8 +1968,7 @@ static int validate_set(const struct nlattr *a, key_len /= 2; if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type].len != key_len && - ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) + !check_attr_len(key_len, ovs_key_lens[key_type].len)) return -EINVAL; if (masked && !validate_masked(nla_data(ovs_key), key_len)) @@ -1937,7 +1982,7 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: - case OVS_KEY_ATTR_CT_LABEL: + case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_ETHERNET: break; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d22d8e948d0f..c7f74aab34b9 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -57,20 +57,21 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range) } void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask) + bool full, const struct sw_flow_mask *mask) { - const long *m = (const long *)((const u8 *)&mask->key + - mask->range.start); - const long *s = (const long *)((const u8 *)src + - mask->range.start); - long *d = (long *)((u8 *)dst + mask->range.start); + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); int i; - /* The memory outside of the 'mask->range' are not set since - * further operations on 'dst' only uses contents within - * 'mask->range'. + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. */ - for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) + for (i = 0; i < len; i += sizeof(long)) *d++ = *s++ & *m++; } @@ -92,7 +93,8 @@ struct sw_flow *ovs_flow_alloc(void) /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_KERNEL | __GFP_ZERO, 0); + GFP_KERNEL | __GFP_ZERO, + node_online(0) ? 0 : NUMA_NO_NODE); if (!stats) goto err; @@ -475,7 +477,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, u32 hash; struct sw_flow_key masked_key; - ovs_flow_mask_key(&masked_key, unmasked, mask); + ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 616eda10d955..2dd9900f533d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -86,5 +86,5 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - const struct sw_flow_mask *mask); + bool full, const struct sw_flow_mask *mask); #endif /* flow_table.h */ diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 388b8a6bf112..b3934126daa8 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -106,12 +106,45 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } +static struct rtnl_link_stats64 * +internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } + + return stats; +} + static const struct net_device_ops internal_dev_netdev_ops = { .ndo_open = internal_dev_open, .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, + .ndo_get_stats64 = internal_get_stats, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -161,6 +194,11 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) err = -ENOMEM; goto error_free_vport; } + vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!vport->dev->tstats) { + err = -ENOMEM; + goto error_free_netdev; + } dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); internal_dev = internal_dev_priv(vport->dev); @@ -173,7 +211,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) rtnl_lock(); err = register_netdevice(vport->dev); if (err) - goto error_free_netdev; + goto error_unlock; dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); @@ -181,8 +219,10 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) return vport; -error_free_netdev: +error_unlock: rtnl_unlock(); + free_percpu(vport->dev->tstats); +error_free_netdev: free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); @@ -198,7 +238,7 @@ static void internal_dev_destroy(struct vport *vport) /* unregister_netdevice() waits for an RCU grace period. */ unregister_netdevice(vport->dev); - + free_percpu(vport->dev->tstats); rtnl_unlock(); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index dc81dc619aa2..12a36ac21eda 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -280,35 +280,19 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { - struct net_device *dev = vport->dev; - int i; - - memset(stats, 0, sizeof(*stats)); - stats->rx_errors = dev->stats.rx_errors; - stats->tx_errors = dev->stats.tx_errors; - stats->tx_dropped = dev->stats.tx_dropped; - stats->rx_dropped = dev->stats.rx_dropped; - - stats->rx_dropped += atomic_long_read(&dev->rx_dropped); - stats->tx_dropped += atomic_long_read(&dev->tx_dropped); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = dev_stats->tx_packets; } /** @@ -460,6 +444,15 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; + if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { + u32 mark; + + mark = skb->mark; + skb_scrub_packet(skb, true); + skb->mark = mark; + tun_info = NULL; + } + /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b8e39a22387..aa4b15c35884 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -230,6 +230,8 @@ struct packet_skb_cb { } sa; }; +#define vio_le() virtio_legacy_is_little_endian() + #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -2680,15 +2682,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && - (__virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > - __virtio16_to_cpu(false, vnet_hdr.hdr_len))) - vnet_hdr.hdr_len = __cpu_to_virtio16(false, - __virtio16_to_cpu(false, vnet_hdr.csum_start) + - __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); + (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) + vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + + __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2); err = -EINVAL; - if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) + if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len) goto out_unlock; if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2731,7 +2733,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, - __virtio16_to_cpu(false, vnet_hdr.hdr_len), + __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len), msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; @@ -2778,8 +2780,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (po->has_vnet_hdr) { if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); - u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); + u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); + u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset); if (!skb_partial_csum_set(skb, s, o)) { err = -EINVAL; goto out_free; @@ -2787,7 +2789,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) } skb_shinfo(skb)->gso_size = - __virtio16_to_cpu(false, vnet_hdr.gso_size); + __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size); skb_shinfo(skb)->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ @@ -3161,9 +3163,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* This is a hint as to how much should be linear. */ vnet_hdr.hdr_len = - __cpu_to_virtio16(false, skb_headlen(skb)); + __cpu_to_virtio16(vio_le(), skb_headlen(skb)); vnet_hdr.gso_size = - __cpu_to_virtio16(false, sinfo->gso_size); + __cpu_to_virtio16(vio_le(), sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -3181,9 +3183,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet_hdr.csum_start = __cpu_to_virtio16(false, + vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(), skb_checksum_start_offset(skb)); - vnet_hdr.csum_offset = __cpu_to_virtio16(false, + vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(), skb->csum_offset); } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; diff --git a/net/rds/connection.c b/net/rds/connection.c index a50e652eb269..49adeef8090c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) } while (0) /* rcu read lock must be held or the connection spinlock */ -static struct rds_connection *rds_conn_lookup(struct hlist_head *head, +static struct rds_connection *rds_conn_lookup(struct net *net, + struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) { @@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_trans == trans) { + conn->c_trans == trans && net == rds_conn_net(conn)) { ret = conn; break; } @@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) goto new_conn; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); + conn = rds_conn_lookup(net, head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && laddr == faddr && !is_outgoing) { /* This is a looped back IB connection, and we're @@ -189,6 +190,12 @@ new_conn: } } + if (trans == NULL) { + kmem_cache_free(rds_conn_slab, conn); + conn = ERR_PTR(-ENODEV); + goto out; + } + conn->c_trans = trans; ret = trans->conn_alloc(conn, gfp); @@ -239,7 +246,7 @@ new_conn: if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP) found = NULL; else - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(net, head, laddr, faddr, trans); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); diff --git a/net/rds/ib.c b/net/rds/ib.c index d020fade312c..2d3f2ab475df 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -99,8 +99,6 @@ static void rds_ib_dev_free(struct work_struct *work) if (rds_ibdev->mr_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); - if (rds_ibdev->mr) - ib_dereg_mr(rds_ibdev->mr); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); @@ -164,12 +162,6 @@ static void rds_ib_add_one(struct ib_device *device) goto put_dev; } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) { - rds_ibdev->mr = NULL; - goto put_dev; - } - rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); if (IS_ERR(rds_ibdev->mr_pool)) { rds_ibdev->mr_pool = NULL; @@ -230,11 +222,10 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) * * This can be called at any time and can be racing with any other RDS path. */ -static void rds_ib_remove_one(struct ib_device *device) +static void rds_ib_remove_one(struct ib_device *device, void *client_data) { - struct rds_ib_device *rds_ibdev; + struct rds_ib_device *rds_ibdev = client_data; - rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (!rds_ibdev) return; diff --git a/net/rds/ib.h b/net/rds/ib.h index 9fc95e38659a..aae60fda77f6 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -100,7 +100,6 @@ struct rds_ib_connection { /* alphabet soup, IBTA style */ struct rdma_cm_id *i_cm_id; struct ib_pd *i_pd; - struct ib_mr *i_mr; struct ib_cq *i_send_cq; struct ib_cq *i_recv_cq; @@ -173,7 +172,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - struct ib_mr *mr; struct rds_ib_mr_pool *mr_pool; unsigned int fmr_max_remaps; unsigned int max_fmrs; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index d150bb4aa3cb..9043f5c04787 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -269,7 +269,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; - ic->i_mr = rds_ibdev->mr; cq_attr.cqe = ic->i_send_ring.w_nr + 1; ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, @@ -375,7 +374,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rds_ib_recv_init_ack(ic); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, + rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); out: @@ -682,7 +681,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id = NULL; ic->i_pd = NULL; - ic->i_mr = NULL; ic->i_send_cq = NULL; ic->i_recv_cq = NULL; ic->i_send_hdrs = NULL; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 6bbe62060060..f43831e4186a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -62,12 +62,12 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; sge = &recv->r_sge[1]; sge->addr = 0; sge->length = RDS_FRAG_SIZE; - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; } } @@ -564,7 +564,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) sge->addr = ic->i_ack_dma; sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; wr->sg_list = sge; wr->num_sge = 1; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index c576ebeb4115..4e88047086b6 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -202,9 +202,9 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + sge->lkey = ic->i_pd->local_dma_lkey; - send->s_sge[1].lkey = ic->i_mr->lkey; + send->s_sge[1].lkey = ic->i_pd->local_dma_lkey; } } @@ -818,7 +818,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) /* Convert our struct scatterlist to struct ib_sge */ send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); - send->s_sge[0].lkey = ic->i_mr->lkey; + send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, send->s_sge[0].addr, send->s_sge[0].length); @@ -932,7 +932,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); send->s_sge[j].length = len; - send->s_sge[j].lkey = ic->i_mr->lkey; + send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; sent += len; rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr); diff --git a/net/rds/iw.c b/net/rds/iw.c index 5d5a9d258658..3df0295c6659 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -125,12 +125,11 @@ free_attr: kfree(dev_attr); } -static void rds_iw_remove_one(struct ib_device *device) +static void rds_iw_remove_one(struct ib_device *device, void *client_data) { - struct rds_iw_device *rds_iwdev; + struct rds_iw_device *rds_iwdev = client_data; struct rds_iw_cm_id *i_cm_id, *next; - rds_iwdev = ib_get_client_data(device, &rds_iw_client); if (!rds_iwdev) return; @@ -149,10 +148,7 @@ static void rds_iw_remove_one(struct ib_device *device) if (rds_iwdev->mr) ib_dereg_mr(rds_iwdev->mr); - while (ib_dealloc_pd(rds_iwdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd); - msleep(1); - } + ib_dealloc_pd(rds_iwdev->pd); list_del(&rds_iwdev->list); kfree(rds_iwdev); diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index dba8d0864f18..6a8fbd6e69e7 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -667,11 +667,12 @@ static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct ib_mr *mr; int err; - mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size); + mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG, + pool->max_message_size); if (IS_ERR(mr)) { err = PTR_ERR(mr); - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err); return err; } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 334fe98c5084..86152ec3b887 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -153,9 +153,10 @@ void rds_iw_send_init_ring(struct rds_iw_connection *ic) sge->length = sizeof(struct rds_header); sge->lkey = 0; - send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size); + send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG, + fastreg_message_size); if (IS_ERR(send->s_mr)) { - printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n"); + printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n"); break; } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index f12149a29cb1..b41e9ea2ffff 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -341,7 +341,15 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked) { struct rfkill *rfkill; - rfkill_global_states[type].cur = blocked; + if (type == RFKILL_TYPE_ALL) { + int i; + + for (i = 0; i < NUM_RFKILL_TYPES; i++) + rfkill_global_states[i].cur = blocked; + } else { + rfkill_global_states[type].cur = blocked; + } + list_for_each_entry(rfkill, &rfkill_list, node) { if (rfkill->type != type && type != RFKILL_TYPE_ALL) continue; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2d1be4a760fd..32fcdecdb9e2 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -31,13 +31,17 @@ #define MIRRED_TAB_MASK 7 static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + /* We could be called either in a RCU callback or with RTNL lock held. */ + spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); + spin_unlock_bh(&mirred_list_lock); if (dev) dev_put(dev); } @@ -103,10 +107,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } else { if (bind) return 0; - if (!ovr) { - tcf_hash_release(a, bind); + + tcf_hash_release(a, bind); + if (!ovr) return -EEXIST; - } } m = to_mirred(a); @@ -123,7 +127,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) { + spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + spin_unlock_bh(&mirred_list_lock); tcf_hash_insert(a); } @@ -173,6 +179,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; + skb_sender_cpu_clear(skb2); err = dev_queue_xmit(skb2); if (err) { @@ -221,7 +228,8 @@ static int mirred_device_event(struct notifier_block *unused, struct tcf_mirred *m; ASSERT_RTNL(); - if (event == NETDEV_UNREGISTER) + if (event == NETDEV_UNREGISTER) { + spin_lock_bh(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); @@ -231,6 +239,8 @@ static int mirred_device_event(struct notifier_block *unused, RCU_INIT_POINTER(m->tcfm_dev, NULL); } } + spin_unlock_bh(&mirred_list_lock); + } return NOTIFY_DONE; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 715e01e5910a..f23a3b68bba6 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,7 +33,6 @@ struct fw_head { u32 mask; - bool mask_set; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; @@ -84,7 +83,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { - /* old method */ + /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ tp->q->handle)))) { res->classid = id; @@ -114,14 +113,9 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle) static int fw_init(struct tcf_proto *tp) { - struct fw_head *head; - - head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); - if (head == NULL) - return -ENOBUFS; - - head->mask_set = false; - rcu_assign_pointer(tp->root, head); + /* We don't allocate fw_head here, because in the old method + * we don't need it at all. + */ return 0; } @@ -252,7 +246,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, int err; if (!opt) - return handle ? -EINVAL : 0; + return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); if (err < 0) @@ -302,11 +296,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (!handle) return -EINVAL; - if (!head->mask_set) { - head->mask = 0xFFFFFFFF; + if (!head) { + u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) - head->mask = nla_get_u32(tb[TCA_FW_MASK]); - head->mask_set = true; + mask = nla_get_u32(tb[TCA_FW_MASK]); + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + return -ENOBUFS; + head->mask = mask; + + rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 9d15cb6b8cb1..86b04e31e60b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -368,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch) return bucket - q->buckets; } +static unsigned int hhf_qdisc_drop(struct Qdisc *sch) +{ + unsigned int prev_backlog; + + prev_backlog = sch->qstats.backlog; + hhf_drop(sch); + return prev_backlog - sch->qstats.backlog; +} + static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); @@ -696,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, - .drop = hhf_drop, + .drop = hhf_qdisc_drop, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 197c3f59ecbf..b00f1f9611d6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1208,20 +1208,22 @@ void sctp_assoc_update(struct sctp_association *asoc, * within this document. * * Our basic strategy is to round-robin transports in priorities - * according to sctp_state_prio_map[] e.g., if no such + * according to sctp_trans_score() e.g., if no such * transport with state SCTP_ACTIVE exists, round-robin through * SCTP_UNKNOWN, etc. You get the picture. */ -static const u8 sctp_trans_state_to_prio_map[] = { - [SCTP_ACTIVE] = 3, /* best case */ - [SCTP_UNKNOWN] = 2, - [SCTP_PF] = 1, - [SCTP_INACTIVE] = 0, /* worst case */ -}; - static u8 sctp_trans_score(const struct sctp_transport *trans) { - return sctp_trans_state_to_prio_map[trans->state]; + switch (trans->state) { + case SCTP_ACTIVE: + return 3; /* best case */ + case SCTP_UNKNOWN: + return 2; + case SCTP_PF: + return 1; + default: /* case SCTP_INACTIVE */ + return 0; /* worst case */ + } } static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b7143337e4fa..3d9ea9a48289 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1186,7 +1186,7 @@ static void sctp_v4_del_protocol(void) unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net) { int status; @@ -1279,12 +1279,6 @@ static int __net_init sctp_net_init(struct net *net) sctp_dbg_objcnt_init(net); - /* Initialize the control inode/socket for handling OOTB packets. */ - if ((status = sctp_ctl_sock_init(net))) { - pr_err("Failed to initialize the SCTP control sock\n"); - goto err_ctl_sock_init; - } - /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); @@ -1300,9 +1294,6 @@ static int __net_init sctp_net_init(struct net *net) return 0; -err_ctl_sock_init: - sctp_dbg_objcnt_exit(net); - sctp_proc_exit(net); err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: @@ -1311,15 +1302,12 @@ err_sysctl_register: return status; } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - /* Free the control endpoint. */ - inet_ctl_sock_destroy(net->sctp.ctl_sock); - sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); @@ -1327,9 +1315,32 @@ static void __net_exit sctp_net_exit(struct net *net) sctp_sysctl_net_unregister(net); } -static struct pernet_operations sctp_net_ops = { - .init = sctp_net_init, - .exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { + .init = sctp_defaults_init, + .exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ + int status; + + /* Initialize the control inode/socket for handling OOTB packets. */ + status = sctp_ctl_sock_init(net); + if (status) + pr_err("Failed to initialize the SCTP control sock\n"); + + return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ + /* Free the control endpoint. */ + inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { + .init = sctp_ctrlsock_init, + .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ @@ -1462,8 +1473,11 @@ static __init int sctp_init(void) sctp_v4_pf_init(); sctp_v6_pf_init(); - status = sctp_v4_protosw_init(); + status = register_pernet_subsys(&sctp_defaults_ops); + if (status) + goto err_register_defaults; + status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; @@ -1471,9 +1485,9 @@ static __init int sctp_init(void) if (status) goto err_v6_protosw_init; - status = register_pernet_subsys(&sctp_net_ops); + status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) - goto err_register_pernet_subsys; + goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) @@ -1489,12 +1503,14 @@ out: err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: - unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: + unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: + unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); @@ -1527,12 +1543,14 @@ static __exit void sctp_exit(void) sctp_v6_del_protocol(); sctp_v4_del_protocol(); - unregister_pernet_subsys(&sctp_net_ops); + unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); + unregister_pernet_subsys(&sctp_defaults_ops); + /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 35df1266bf07..6098d4c42fa9 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -244,12 +244,13 @@ void sctp_generate_t3_rtx_event(unsigned long peer) int error; struct sctp_transport *transport = (struct sctp_transport *) peer; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); /* Check whether a task is in the sock. */ - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -272,10 +273,10 @@ void sctp_generate_t3_rtx_event(unsigned long peer) transport, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -285,11 +286,12 @@ out_unlock: static void sctp_generate_timeout_event(struct sctp_association *asoc, sctp_event_timeout_t timeout_type) { - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); int error = 0; - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); @@ -312,10 +314,10 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc, (void *)timeout_type, GFP_ATOMIC); if (error) - asoc->base.sk->sk_err = -error; + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } @@ -365,10 +367,11 @@ void sctp_generate_heartbeat_event(unsigned long data) int error = 0; struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -388,11 +391,11 @@ void sctp_generate_heartbeat_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); - if (error) - asoc->base.sk->sk_err = -error; + if (error) + sk->sk_err = -error; out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_transport_put(transport); } @@ -403,10 +406,11 @@ void sctp_generate_proto_unreach_event(unsigned long data) { struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; - struct net *net = sock_net(asoc->base.sk); + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); - bh_lock_sock(asoc->base.sk); - if (sock_owned_by_user(asoc->base.sk)) { + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ @@ -427,7 +431,7 @@ void sctp_generate_proto_unreach_event(unsigned long data) asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: - bh_unlock_sock(asoc->base.sk); + bh_unlock_sock(sk); sctp_association_put(asoc); } diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4feda2d0a833..548240dd15fc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -23,7 +23,7 @@ struct unx_cred { }; #define uc_uid uc_base.cr_uid -#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) +#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 2928afffbb81..4a2340a54401 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -44,7 +44,7 @@ static void cache_revisit_request(struct cache_head *item); static void cache_init(struct cache_head *h) { time_t now = seconds_since_boot(); - h->next = NULL; + INIT_HLIST_NODE(&h->cache_list); h->flags = 0; kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; @@ -54,15 +54,14 @@ static void cache_init(struct cache_head *h) struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, struct cache_head *key, int hash) { - struct cache_head **head, **hp; - struct cache_head *new = NULL, *freeme = NULL; + struct cache_head *new = NULL, *freeme = NULL, *tmp = NULL; + struct hlist_head *head; head = &detail->hash_table[hash]; read_lock(&detail->hash_lock); - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) /* This entry is expired, we will discard it. */ @@ -88,12 +87,10 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, write_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - for (hp=head; *hp != NULL ; hp = &(*hp)->next) { - struct cache_head *tmp = *hp; + hlist_for_each_entry(tmp, head, cache_list) { if (detail->match(tmp, key)) { if (cache_is_expired(detail, tmp)) { - *hp = tmp->next; - tmp->next = NULL; + hlist_del_init(&tmp->cache_list); detail->entries --; freeme = tmp; break; @@ -104,8 +101,8 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, return tmp; } } - new->next = *head; - *head = new; + + hlist_add_head(&new->cache_list, head); detail->entries++; cache_get(new); write_unlock(&detail->hash_lock); @@ -143,7 +140,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, * If 'old' is not VALID, we update it directly, * otherwise we need to replace it */ - struct cache_head **head; struct cache_head *tmp; if (!test_bit(CACHE_VALID, &old->flags)) { @@ -168,15 +164,13 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, } cache_init(tmp); detail->init(tmp, old); - head = &detail->hash_table[hash]; write_lock(&detail->hash_lock); if (test_bit(CACHE_NEGATIVE, &new->flags)) set_bit(CACHE_NEGATIVE, &tmp->flags); else detail->update(tmp, new); - tmp->next = *head; - *head = tmp; + hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); cache_fresh_locked(tmp, new->expiry_time); @@ -416,28 +410,29 @@ static int cache_clean(void) /* find a non-empty bucket in the table */ while (current_detail && current_index < current_detail->hash_size && - current_detail->hash_table[current_index] == NULL) + hlist_empty(¤t_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ if (current_detail && current_index < current_detail->hash_size) { - struct cache_head *ch, **cp; + struct cache_head *ch = NULL; struct cache_detail *d; + struct hlist_head *head; + struct hlist_node *tmp; write_lock(¤t_detail->hash_lock); /* Ok, now to clean this strand */ - cp = & current_detail->hash_table[current_index]; - for (ch = *cp ; ch ; cp = & ch->next, ch = *cp) { + head = ¤t_detail->hash_table[current_index]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) current_detail->nextcheck = ch->expiry_time+1; if (!cache_is_expired(current_detail, ch)) continue; - *cp = ch->next; - ch->next = NULL; + hlist_del_init(&ch->cache_list); current_detail->entries--; rv = 1; break; @@ -1270,18 +1265,13 @@ EXPORT_SYMBOL_GPL(qword_get); * get a header, then pass each real item in the cache */ -struct handle { - struct cache_detail *cd; -}; - -static void *c_start(struct seq_file *m, loff_t *pos) +void *cache_seq_start(struct seq_file *m, loff_t *pos) __acquires(cd->hash_lock) { loff_t n = *pos; unsigned int hash, entry; struct cache_head *ch; - struct cache_detail *cd = ((struct handle*)m->private)->cd; - + struct cache_detail *cd = m->private; read_lock(&cd->hash_lock); if (!n--) @@ -1289,7 +1279,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash = n >> 32; entry = n & ((1LL<<32) - 1); - for (ch=cd->hash_table[hash]; ch; ch=ch->next) + hlist_for_each_entry(ch, &cd->hash_table[hash], cache_list) if (!entry--) return ch; n &= ~((1LL<<32) - 1); @@ -1297,51 +1287,57 @@ static void *c_start(struct seq_file *m, loff_t *pos) hash++; n += 1LL<<32; } while(hash < cd->hash_size && - cd->hash_table[hash]==NULL); + hlist_empty(&cd->hash_table[hash])); if (hash >= cd->hash_size) return NULL; *pos = n+1; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_start); -static void *c_next(struct seq_file *m, void *p, loff_t *pos) +void *cache_seq_next(struct seq_file *m, void *p, loff_t *pos) { struct cache_head *ch = p; int hash = (*pos >> 32); - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) hash = 0; - else if (ch->next == NULL) { + else if (ch->cache_list.next == NULL) { hash++; *pos += 1LL<<32; } else { ++*pos; - return ch->next; + return hlist_entry_safe(ch->cache_list.next, + struct cache_head, cache_list); } *pos &= ~((1LL<<32) - 1); while (hash < cd->hash_size && - cd->hash_table[hash] == NULL) { + hlist_empty(&cd->hash_table[hash])) { hash++; *pos += 1LL<<32; } if (hash >= cd->hash_size) return NULL; ++*pos; - return cd->hash_table[hash]; + return hlist_entry_safe(cd->hash_table[hash].first, + struct cache_head, cache_list); } +EXPORT_SYMBOL_GPL(cache_seq_next); -static void c_stop(struct seq_file *m, void *p) +void cache_seq_stop(struct seq_file *m, void *p) __releases(cd->hash_lock) { - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; read_unlock(&cd->hash_lock); } +EXPORT_SYMBOL_GPL(cache_seq_stop); static int c_show(struct seq_file *m, void *p) { struct cache_head *cp = p; - struct cache_detail *cd = ((struct handle*)m->private)->cd; + struct cache_detail *cd = m->private; if (p == SEQ_START_TOKEN) return cd->cache_show(m, cd, NULL); @@ -1364,33 +1360,36 @@ static int c_show(struct seq_file *m, void *p) } static const struct seq_operations cache_content_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, + .start = cache_seq_start, + .next = cache_seq_next, + .stop = cache_seq_stop, .show = c_show, }; static int content_open(struct inode *inode, struct file *file, struct cache_detail *cd) { - struct handle *han; + struct seq_file *seq; + int err; if (!cd || !try_module_get(cd->owner)) return -EACCES; - han = __seq_open_private(file, &cache_content_op, sizeof(*han)); - if (han == NULL) { + + err = seq_open(file, &cache_content_op); + if (err) { module_put(cd->owner); - return -ENOMEM; + return err; } - han->cd = cd; + seq = file->private_data; + seq->private = cd; return 0; } static int content_release(struct inode *inode, struct file *file, struct cache_detail *cd) { - int ret = seq_release_private(inode, file); + int ret = seq_release(inode, file); module_put(cd->owner); return ret; } @@ -1665,17 +1664,21 @@ EXPORT_SYMBOL_GPL(cache_unregister_net); struct cache_detail *cache_create_net(struct cache_detail *tmpl, struct net *net) { struct cache_detail *cd; + int i; cd = kmemdup(tmpl, sizeof(struct cache_detail), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kzalloc(cd->hash_size * sizeof(struct cache_head *), + cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head), GFP_KERNEL); if (cd->hash_table == NULL) { kfree(cd); return ERR_PTR(-ENOMEM); } + + for (i = 0; i < cd->hash_size; i++) + INIT_HLIST_HEAD(&cd->hash_table[i]); cd->net = net; return cd; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 337ca851a350..f14f24ee9983 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1092,14 +1092,10 @@ void rpc_destroy_mempool(void) { rpciod_stop(); - if (rpc_buffer_mempool) - mempool_destroy(rpc_buffer_mempool); - if (rpc_task_mempool) - mempool_destroy(rpc_task_mempool); - if (rpc_task_slabp) - kmem_cache_destroy(rpc_task_slabp); - if (rpc_buffer_slabp) - kmem_cache_destroy(rpc_buffer_slabp); + mempool_destroy(rpc_buffer_mempool); + mempool_destroy(rpc_task_mempool); + kmem_cache_destroy(rpc_task_slabp); + kmem_cache_destroy(rpc_buffer_slabp); rpc_destroy_wait_queue(&delay_queue); } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5a16d8d8c831..a8f579df14d8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -34,36 +34,19 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net); -#define svc_serv_is_pooled(serv) ((serv)->sv_function) +#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function) -/* - * Mode for mapping cpus to pools. - */ -enum { - SVC_POOL_AUTO = -1, /* choose one of the others */ - SVC_POOL_GLOBAL, /* no mapping, just a single global pool - * (legacy & UP mode) */ - SVC_POOL_PERCPU, /* one pool per cpu */ - SVC_POOL_PERNODE /* one pool per numa node */ -}; #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ -static struct svc_pool_map { - int count; /* How many svc_servs use us */ - int mode; /* Note: int not enum to avoid - * warnings about "enumeration value - * not handled in switch" */ - unsigned int npools; - unsigned int *pool_to; /* maps pool id to cpu or node */ - unsigned int *to_pool; /* maps cpu or node to pool id */ -} svc_pool_map = { - .count = 0, +struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; +EXPORT_SYMBOL_GPL(svc_pool_map); + static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int @@ -236,7 +219,7 @@ svc_pool_map_init_pernode(struct svc_pool_map *m) * vice versa). Initialise the map if we're the first user. * Returns the number of pools. */ -static unsigned int +unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; @@ -271,7 +254,7 @@ svc_pool_map_get(void) mutex_unlock(&svc_pool_map_mutex); return m->npools; } - +EXPORT_SYMBOL_GPL(svc_pool_map_get); /* * Drop a reference to the global map of cpus to pools. @@ -280,7 +263,7 @@ svc_pool_map_get(void) * mode using the pool_mode module option without * rebooting or re-loading sunrpc.ko. */ -static void +void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; @@ -297,7 +280,7 @@ svc_pool_map_put(void) mutex_unlock(&svc_pool_map_mutex); } - +EXPORT_SYMBOL_GPL(svc_pool_map_put); static int svc_pool_map_get_node(unsigned int pidx) { @@ -423,7 +406,7 @@ EXPORT_SYMBOL_GPL(svc_bind); */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -440,7 +423,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); - serv->sv_shutdown = shutdown; + serv->sv_ops = ops; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -486,26 +469,22 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net)) + struct svc_serv_ops *ops) { - return __svc_create(prog, bufsize, /*npools*/1, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, ops); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - void (*shutdown)(struct svc_serv *serv, struct net *net), - svc_thread_fn func, struct module *mod) + struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, shutdown); + serv = __svc_create(prog, bufsize, npools, ops); if (!serv) goto out_err; - - serv->sv_function = func; - serv->sv_module = mod; return serv; out_err: svc_pool_map_put(); @@ -517,8 +496,8 @@ void svc_shutdown_net(struct svc_serv *serv, struct net *net) { svc_close_net(serv, net); - if (serv->sv_shutdown) - serv->sv_shutdown(serv, net); + if (serv->sv_ops->svo_shutdown) + serv->sv_ops->svo_shutdown(serv, net); } EXPORT_SYMBOL_GPL(svc_shutdown_net); @@ -604,40 +583,52 @@ svc_release_buffer(struct svc_rqst *rqstp) } struct svc_rqst * -svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) - goto out_enomem; + return rqstp; - serv->sv_nrthreads++; __set_bit(RQ_BUSY, &rqstp->rq_flags); spin_lock_init(&rqstp->rq_lock); rqstp->rq_server = serv; rqstp->rq_pool = pool; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads++; - list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) - goto out_thread; + goto out_enomem; rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) - goto out_thread; + goto out_enomem; if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) - goto out_thread; + goto out_enomem; return rqstp; -out_thread: - svc_exit_thread(rqstp); out_enomem: - return ERR_PTR(-ENOMEM); + svc_rqst_free(rqstp); + return NULL; +} +EXPORT_SYMBOL_GPL(svc_rqst_alloc); + +struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) +{ + struct svc_rqst *rqstp; + + rqstp = svc_rqst_alloc(serv, pool, node); + if (!rqstp) + return ERR_PTR(-ENOMEM); + + serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); + return rqstp; } EXPORT_SYMBOL_GPL(svc_prepare_thread); @@ -739,12 +730,12 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) break; } - __module_get(serv->sv_module); - task = kthread_create_on_node(serv->sv_function, rqstp, + __module_get(serv->sv_ops->svo_module); + task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { error = PTR_ERR(task); - module_put(serv->sv_module); + module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); break; } @@ -772,15 +763,21 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); * mutex" for the service. */ void -svc_exit_thread(struct svc_rqst *rqstp) +svc_rqst_free(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + kfree_rcu(rqstp, rq_rcu_head); +} +EXPORT_SYMBOL_GPL(svc_rqst_free); + +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads--; @@ -788,7 +785,7 @@ svc_exit_thread(struct svc_rqst *rqstp) list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); - kfree_rcu(rqstp, rq_rcu_head); + svc_rqst_free(rqstp); /* Release the server */ if (serv) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 163ac45c3639..a6cbb2104667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -24,7 +24,6 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static void svc_age_temp_xprts(unsigned long closure); static void svc_delete_xprt(struct svc_xprt *xprt); -static void svc_xprt_do_enqueue(struct svc_xprt *xprt); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -225,12 +224,12 @@ static void svc_xprt_received(struct svc_xprt *xprt) } /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_do_enqueue with: + * 'put', so we need a reference to call svc_enqueue_xprt with: */ svc_xprt_get(xprt); smp_mb__before_atomic(); clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); svc_xprt_put(xprt); } @@ -320,7 +319,7 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) return false; } -static void svc_xprt_do_enqueue(struct svc_xprt *xprt) +void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp = NULL; @@ -402,6 +401,7 @@ redo_search: out: trace_svc_xprt_do_enqueue(xprt, rqstp); } +EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); /* * Queue up a transport with data pending. If there are idle nfsd @@ -412,7 +412,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) { if (test_bit(XPT_BUSY, &xprt->xpt_flags)) return; - svc_xprt_do_enqueue(xprt); + xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ab5dd621ae0c..2e98f4a243e5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -614,6 +614,7 @@ static void xprt_autoclose(struct work_struct *work) clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); + wake_up_bit(&xprt->state, XPRT_LOCKED); } /** @@ -723,6 +724,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) xprt->ops->release_xprt(xprt, NULL); out: spin_unlock_bh(&xprt->transport_lock); + wake_up_bit(&xprt->state, XPRT_LOCKED); } /** @@ -1394,6 +1396,10 @@ out: static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); + + /* Exclude transport connect/disconnect handlers */ + wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); + del_timer_sync(&xprt->timer); rpc_xprt_debugfs_unregister(xprt); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 04ea914201b2..5318951b3b53 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -117,7 +117,7 @@ __frwr_recovery_worker(struct work_struct *work) if (ib_dereg_mr(r->r.frmr.fr_mr)) goto out_fail; - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth); + r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); if (IS_ERR(r->r.frmr.fr_mr)) goto out_fail; @@ -148,7 +148,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, struct rpcrdma_frmr *f = &r->r.frmr; int rc; - f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); if (IS_ERR(f->fr_mr)) goto out_mr_err; f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); @@ -158,7 +158,7 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, out_mr_err: rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + dprintk("RPC: %s: ib_alloc_mr status %i\n", __func__, rc); return rc; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 41985d07fdb7..617b76f22154 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -23,6 +23,21 @@ static int physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + struct ib_mr *mr; + + /* Obtain an rkey to use for RPC data payloads. + */ + mr = ib_get_dma_mr(ia->ri_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + if (IS_ERR(mr)) { + pr_err("%s: ib_get_dma_mr for failed with %lX\n", + __func__, PTR_ERR(mr)); + return -ENOMEM; + } + + ia->ri_dma_mr = mr; return 0; } @@ -51,7 +66,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_ia *ia = &r_xprt->rx_ia; rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_rkey = ia->ri_dma_mr->rkey; seg->mr_base = seg->mr_dma; seg->mr_nsegs = 1; return 1; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 84ea37daef36..bc8bd6577467 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,6 +71,67 @@ static const char transfertypes[][12] = { }; #endif +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. If the + * combined call message size exceeds that limit, the client must use + * the read chunk list for this operation. + */ +static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +{ + unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + + return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. + */ +static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +{ + unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + + return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); +} + +static int +rpcrdma_tail_pullup(struct xdr_buf *buf) +{ + size_t tlen = buf->tail[0].iov_len; + size_t skip = tlen & 3; + + /* Do not include the tail if it is only an XDR pad */ + if (tlen < 4) + return 0; + + /* xdr_write_pages() adds a pad at the beginning of the tail + * if the content in "buf->pages" is unaligned. Force the + * tail's actual content to land at the next XDR position + * after the head instead. + */ + if (skip) { + unsigned char *src, *dst; + unsigned int count; + + src = buf->tail[0].iov_base; + dst = buf->head[0].iov_base; + dst += buf->head[0].iov_len; + + src += skip; + tlen -= skip; + + dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n", + __func__, skip, dst, src, tlen); + + for (count = tlen; count; count--) + *dst++ = *src++; + } + + return tlen; +} + /* * Chunk assembly from upper layer xdr_buf. * @@ -122,6 +183,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == nsegs) return -EIO; + /* When encoding the read list, the tail is always sent inline */ + if (type == rpcrdma_readch) + return n; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ @@ -297,8 +362,7 @@ out: * pre-registered memory buffer for this request. For small amounts * of data, this is efficient. The cutoff value is tunable. */ -static int -rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) +static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) { int i, npages, curlen; int copy_len; @@ -310,16 +374,9 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; destp += curlen; - /* - * Do optional padding where it makes sense. Alignment of write - * payload can help the server, if our setting is accurate. - */ - pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); - if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) - pad = 0; /* don't pad this request */ - dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", - __func__, pad, destp, rqst->rq_slen, curlen); + dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n", + __func__, destp, rqst->rq_slen, curlen); copy_len = rqst->rq_snd_buf.page_len; @@ -355,7 +412,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) page_base = 0; } /* header now contains entire send message */ - return pad; } /* @@ -380,7 +436,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); char *base; - size_t rpclen, padlen; + size_t rpclen; ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; @@ -402,28 +458,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * + * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops - * return as inline (but see later). + * return as inline. * o Large non-read ops return as a single reply chunk. - * o Large read ops return data as write chunk(s), header as inline. - * - * Note: the NFS code sending down multiple result segments implies - * the op is one of read, readdir[plus], readlink or NFSv4 getacl. - */ - - /* - * This code can handle read chunks, write chunks OR reply - * chunks -- only one type. If the request is too big to fit - * inline, then we will choose read chunks. If the request is - * a READ, then use write chunks to separate the file data - * into pages; otherwise use reply chunks. */ - if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + if (rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; + else if (rpcrdma_results_inline(rqst)) + wtype = rpcrdma_noch; else wtype = rpcrdma_replych; @@ -432,21 +475,25 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * * o If the total request is under the inline threshold, all ops * are sent as inline. - * o Large non-write ops are sent with the entire message as a - * single read chunk (protocol 0-position special case). * o Large write ops transmit data as read chunk(s), header as * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). * - * Note: the NFS code sending down multiple argument segments - * implies the op is a write. - * TBD check NFSv4 setacl + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. */ - if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + if (rpcrdma_args_inline(rqst)) { rtype = rpcrdma_noch; - else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; - else + } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + headerp->rm_type = htonl(RDMA_NOMSG); + rtype = rpcrdma_areadch; + rpclen = 0; + } /* The following simplification is not true forever */ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) @@ -458,7 +505,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) } hdrlen = RPCRDMA_HDRLEN_MIN; - padlen = 0; /* * Pull up any extra send data into the preregistered buffer. @@ -467,45 +513,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rtype == rpcrdma_noch) { - padlen = rpcrdma_inline_pullup(rqst, - RPCRDMA_INLINE_PAD_VALUE(rqst)); - - if (padlen) { - headerp->rm_type = rdma_msgp; - headerp->rm_body.rm_padded.rm_align = - cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst)); - headerp->rm_body.rm_padded.rm_thresh = - cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH); - headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; - headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; - hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { - dprintk("RPC: %s: invalid chunk list\n", - __func__); - return -EIO; - } - } else { - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - /* - * Currently we try to not actually use read inline. - * Reply chunks have the desirable property that - * they land, packed, directly in the target buffers - * without headers, so they require no fixup. The - * additional RDMA Write op sends the same amount - * of data, streams on-the-wire and adds no overhead - * on receive. Therefore, we request a reply chunk - * for non-writes wherever feasible and efficient. - */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; - } - } + rpcrdma_inline_pullup(rqst); + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; + /* new length after pullup */ + rpclen = rqst->rq_svec[0].iov_len; + } else if (rtype == rpcrdma_readch) + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); if (rtype != rpcrdma_noch) { hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, headerp, rtype); @@ -518,9 +534,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (hdrlen < 0) return hdrlen; - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* @@ -534,26 +550,15 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + req->rl_niovs = 1; + if (rtype == rpcrdma_areadch) + return 0; + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); req->rl_send_iov[1].length = rpclen; req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); req->rl_niovs = 2; - - if (padlen) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf); - req->rl_send_iov[2].length = padlen; - req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf); - - req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; - req->rl_send_iov[3].length = rqst->rq_slen - rpclen; - req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf); - - req->rl_niovs = 4; - } - return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2e1348bde325..f0c3ff67ca98 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -115,15 +115,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) -{ - if (!rdma_cap_read_multi_sge(xprt->sc_cm_id->device, - xprt->sc_cm_id->port_num)) - return 1; - else - return min_t(int, sge_count, xprt->sc_max_sge); -} - /* Issue an RDMA_READ using the local lkey to map the data sink */ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, @@ -144,9 +135,9 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ctxt->direction = DMA_FROM_DEVICE; ctxt->read_hdr = head; - pages_needed = - min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); - read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); + read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, + rs_length); for (pno = 0; pno < pages_needed; pno++) { int len = min_t(int, rs_length, PAGE_SIZE - pg_off); @@ -245,7 +236,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, ctxt->direction = DMA_FROM_DEVICE; ctxt->frmr = frmr; pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); - read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, + rs_length); frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); frmr->direction = DMA_FROM_DEVICE; @@ -541,7 +533,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.page_base = head->arg.page_base; /* rq_respages starts after the last arg page */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_pages[page_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d25cd430f9ff..1dfae8317065 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -136,6 +136,79 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, return dma_addr; } +/* Returns the address of the first read chunk or <nul> if no read chunk + * is present + */ +struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == xdr_zero) + return NULL; + return ch; +} + +/* Returns the address of the first read write array element or <nul> + * if no write array list is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) +{ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] == xdr_zero) + return NULL; + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; +} + +/* Returns the address of the first reply array element or <nul> if no + * reply array is present + */ +static struct rpcrdma_write_array * +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *rch; + struct rpcrdma_write_array *wr_ary; + struct rpcrdma_write_array *rp_ary; + + /* XXX: Need to fix when reply chunk may occur with read list + * and/or write list. + */ + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || + rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return NULL; + + rch = svc_rdma_get_read_chunk(rmsgp); + if (rch) { + while (rch->rc_discrim != xdr_zero) + rch++; + + /* The reply chunk follows an empty write array located + * at 'rc_position' here. The reply array is at rc_target. + */ + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; + goto found_it; + } + + wr_ary = svc_rdma_get_write_array(rmsgp); + if (wr_ary) { + int chunk = be32_to_cpu(wr_ary->wc_nchunks); + + rp_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[chunk].wc_target.rs_length; + goto found_it; + } + + /* No read list, no write list */ + rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; + + found_it: + if (rp_ary->wc_discrim == xdr_zero) + return NULL; + return rp_ary; +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -384,6 +457,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int byte_count) { struct ib_send_wr send_wr; + u32 xdr_off; int sge_no; int sge_bytes; int page_no; @@ -418,8 +492,8 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->direction = DMA_TO_DEVICE; /* Map the payload indicated by 'byte_count' */ + xdr_off = 0; for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; ctxt->sge[sge_no].addr = @@ -457,6 +531,13 @@ static int send_reply(struct svcxprt_rdma *rdma, } rqstp->rq_next_page = rqstp->rq_respages + 1; + /* The loop above bumps sc_dma_used for each sge. The + * xdr_buf.tail gets a separate sge, but resides in the + * same page as xdr_buf.head. Don't count it twice. + */ + if (sge_no > ctxt->count) + atomic_dec(&rdma->sc_dma_used); + if (sge_no > rdma->sc_max_sge) { pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6b36279e4288..fcc3eb80c265 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -91,7 +91,7 @@ struct svc_xprt_class svc_rdma_class = { .xcl_name = "rdma", .xcl_owner = THIS_MODULE, .xcl_ops = &svc_rdma_ops, - .xcl_max_payload = RPCRDMA_MAXPAYLOAD, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, .xcl_ident = XPRT_TRANSPORT_RDMA, }; @@ -659,6 +659,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; default: @@ -733,17 +734,19 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) struct ib_mr *mr; struct ib_fast_reg_page_list *pl; struct svc_rdma_fastreg_mr *frmr; + u32 num_sg; frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); if (!frmr) goto err; - mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len); + mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg); if (IS_ERR(mr)) goto err_free_frmr; pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, - RPCSVC_MAXPAGES); + num_sg); if (IS_ERR(pl)) goto err_free_mr; @@ -872,6 +875,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * capabilities of this particular device */ newxprt->sc_max_sge = min((size_t)devattr.max_sge, (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + RPCSVC_MAXPAGES); newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, (size_t)svcrdma_max_requests); newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; @@ -1046,6 +1051,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " remote_ip : %pI4\n" " remote_port : %d\n" " max_sge : %d\n" + " max_sge_rd : %d\n" " sq_depth : %d\n" " max_requests : %d\n" " ord : %d\n", @@ -1059,6 +1065,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> route.addr.dst_addr)->sin_port), newxprt->sc_max_sge, + newxprt->sc_max_sge_rd, newxprt->sc_sq_depth, newxprt->sc_max_requests, newxprt->sc_ord); @@ -1201,40 +1208,6 @@ static int svc_rdma_secure_port(struct svc_rqst *rqstp) return 1; } -/* - * Attempt to register the kvec representing the RPC memory with the - * device. - * - * Returns: - * NULL : The device does not support fastreg or there were no more - * fastreg mr. - * frmr : The kvec register request was successfully posted. - * <0 : An error was encountered attempting to register the kvec. - */ -int svc_rdma_fastreg(struct svcxprt_rdma *xprt, - struct svc_rdma_fastreg_mr *frmr) -{ - struct ib_send_wr fastreg_wr; - u8 key; - - /* Bump the key */ - key = (u8)(frmr->mr->lkey & 0x000000FF); - ib_update_fast_reg_key(frmr->mr, ++key); - - /* Prepare FASTREG WR */ - memset(&fastreg_wr, 0, sizeof fastreg_wr); - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.send_flags = IB_SEND_SIGNALED; - fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; - fastreg_wr.wr.fast_reg.page_list = frmr->page_list; - fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = frmr->map_len; - fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; - fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; - return svc_rdma_send(xprt, &fastreg_wr); -} - int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { struct ib_send_wr *bad_wr, *n_wr; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 680f888a9ddd..41e452bc580c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -175,10 +175,8 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) } static void -xprt_rdma_format_addresses(struct rpc_xprt *xprt) +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { - struct sockaddr *sap = (struct sockaddr *) - &rpcx_to_rdmad(xprt).addr; char buf[128]; switch (sap->sa_family) { @@ -272,8 +270,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) xprt_clear_connected(xprt); - rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); @@ -302,7 +300,7 @@ xprt_setup_rdma(struct xprt_create *args) struct rpc_xprt *xprt; struct rpcrdma_xprt *new_xprt; struct rpcrdma_ep *new_ep; - struct sockaddr_in *sin; + struct sockaddr *sap; int rc; if (args->addrlen > sizeof(xprt->addr)) { @@ -333,26 +331,20 @@ xprt_setup_rdma(struct xprt_create *args) * Set up RDMA-specific connect data. */ - /* Put server RDMA address in local cdata */ - memcpy(&cdata.addr, args->dstaddr, args->addrlen); + sap = (struct sockaddr *)&cdata.addr; + memcpy(sap, args->dstaddr, args->addrlen); /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; xprt->addrlen = args->addrlen; - memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); + memcpy(&xprt->addr, sap, xprt->addrlen); - sin = (struct sockaddr_in *)&cdata.addr; - if (ntohs(sin->sin_port) != 0) + if (rpc_get_port(sap)) xprt_set_bound(xprt); - dprintk("RPC: %s: %pI4:%u\n", - __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port)); - - /* Set max requests */ cdata.max_requests = xprt->max_reqs; - /* Set some length limits */ cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ @@ -375,8 +367,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, - xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); if (rc) goto out1; @@ -409,7 +400,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt_rdma_format_addresses(xprt); + xprt_rdma_format_addresses(xprt, sap); xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; @@ -420,6 +411,9 @@ xprt_setup_rdma(struct xprt_create *args) if (!try_module_get(THIS_MODULE)) goto out4; + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); return xprt; out4: @@ -653,31 +647,30 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) if (xprt_connected(xprt)) idle_time = (long)(jiffies - xprt->last_used) / HZ; - seq_printf(seq, - "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " - "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", - - 0, /* need a local port? */ - xprt->stat.bind_count, - xprt->stat.connect_count, - xprt->stat.connect_time, - idle_time, - xprt->stat.sends, - xprt->stat.recvs, - xprt->stat.bad_xids, - xprt->stat.req_u, - xprt->stat.bklog_u, - - r_xprt->rx_stats.read_chunk_count, - r_xprt->rx_stats.write_chunk_count, - r_xprt->rx_stats.reply_chunk_count, - r_xprt->rx_stats.total_rdma_request, - r_xprt->rx_stats.total_rdma_reply, - r_xprt->rx_stats.pullup_copy_count, - r_xprt->rx_stats.fixup_copy_count, - r_xprt->rx_stats.hardway_register_count, - r_xprt->rx_stats.failed_marshal_count, - r_xprt->rx_stats.bad_reply_count); + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); } static int diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 891c4ede2c20..5502d4dade74 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -52,6 +52,7 @@ #include <linux/prefetch.h> #include <linux/sunrpc/addr.h> #include <asm/bitops.h> +#include <linux/module.h> /* try_module_get()/module_put() */ #include "xprt_rdma.h" @@ -414,6 +415,14 @@ connected: return 0; } +static void rpcrdma_destroy_id(struct rdma_cm_id *id) +{ + if (id) { + module_put(id->device->owner); + rdma_destroy_id(id); + } +} + static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -440,6 +449,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); + + /* FIXME: + * Until xprtrdma supports DEVICE_REMOVAL, the provider must + * be pinned while there are active NFS/RDMA mounts to prevent + * hangs and crashes at umount time. + */ + if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { + dprintk("RPC: %s: Failed to get device module\n", + __func__); + ia->ri_async_rc = -ENODEV; + } rc = ia->ri_async_rc; if (rc) goto out; @@ -449,16 +469,17 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto out; + goto put; } wait_for_completion_interruptible_timeout(&ia->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1); rc = ia->ri_async_rc; if (rc) - goto out; + goto put; return id; - +put: + module_put(id->device->owner); out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -493,9 +514,11 @@ rpcrdma_clean_cq(struct ib_cq *cq) int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { - int rc, mem_priv; struct rpcrdma_ia *ia = &xprt->rx_ia; struct ib_device_attr *devattr = &ia->ri_devattr; + int rc; + + ia->ri_dma_mr = NULL; ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { @@ -519,17 +542,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out3; } - if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { - ia->ri_have_dma_lkey = 1; - ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; - } - if (memreg == RPCRDMA_FRMR) { - /* Requires both frmr reg and local dma lkey */ - if (((devattr->device_cap_flags & - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -539,42 +554,20 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); - memreg = RPCRDMA_ALLPHYSICAL; + rc = -EINVAL; + goto out3; } } - /* - * Optionally obtain an underlying physical identity mapping in - * order to do a memory window-based bind. This base registration - * is protected from remote access - that is enabled only by binding - * for the specific bytes targeted during each RPC operation, and - * revoked after the corresponding completion similar to a storage - * adapter. - */ switch (memreg) { case RPCRDMA_FRMR: ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: ia->ri_ops = &rpcrdma_physical_memreg_ops; - mem_priv = IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ; - goto register_setup; + break; case RPCRDMA_MTHCAFMR: ia->ri_ops = &rpcrdma_fmr_memreg_ops; - if (ia->ri_have_dma_lkey) - break; - mem_priv = IB_ACCESS_LOCAL_WRITE; - register_setup: - ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); - if (IS_ERR(ia->ri_bind_mem)) { - printk(KERN_ALERT "%s: ib_get_dma_mr for " - "phys register failed with %lX\n", - __func__, PTR_ERR(ia->ri_bind_mem)); - rc = -ENOMEM; - goto out3; - } break; default: printk(KERN_ERR "RPC: Unsupported memory " @@ -592,7 +585,7 @@ out3: ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; out2: - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; out1: return rc; @@ -606,25 +599,17 @@ out1: void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering\n", __func__); - if (ia->ri_bind_mem != NULL) { - rc = ib_dereg_mr(ia->ri_bind_mem); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + rpcrdma_destroy_id(ia->ri_id); ia->ri_id = NULL; } /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - WARN_ON(ib_dealloc_pd(ia->ri_pd)); + ib_dealloc_pd(ia->ri_pd); } /* @@ -639,6 +624,12 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct ib_cq_init_attr cq_attr = {}; int rc, err; + if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + dprintk("RPC: %s: insufficient sge's available\n", + __func__); + return -ENOMEM; + } + /* check provider's send/recv wr limits */ if (cdata->max_requests > devattr->max_qp_wr) cdata->max_requests = devattr->max_qp_wr; @@ -651,21 +642,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); + ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; ep->rep_attr.qp_type = IB_QPT_RC; ep->rep_attr.port_num = ~0; - if (cdata->padding) { - ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding, - GFP_KERNEL); - if (IS_ERR(ep->rep_padbuf)) - return PTR_ERR(ep->rep_padbuf); - } else - ep->rep_padbuf = NULL; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, @@ -748,7 +731,8 @@ out2: dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, err); out1: - rpcrdma_free_regbuf(ia, ep->rep_padbuf); + if (ia->ri_dma_mr) + ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -769,25 +753,32 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) { + if (ia->ri_id->qp) rpcrdma_ep_disconnect(ep, ia); + + rpcrdma_clean_cq(ep->rep_attr.recv_cq); + rpcrdma_clean_cq(ep->rep_attr.send_cq); + + if (ia->ri_id->qp) { rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } - rpcrdma_free_regbuf(ia, ep->rep_padbuf); - - rpcrdma_clean_cq(ep->rep_attr.recv_cq); rc = ib_destroy_cq(ep->rep_attr.recv_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_attr.send_cq); rc = ib_destroy_cq(ep->rep_attr.send_cq); if (rc) dprintk("RPC: %s: ib_destroy_cq returned %i\n", __func__, rc); + + if (ia->ri_dma_mr) { + rc = ib_dereg_mr(ia->ri_dma_mr); + dprintk("RPC: %s: ib_dereg_mr returned %i\n", + __func__, rc); + } } /* @@ -825,7 +816,7 @@ retry: if (ia->ri_device != id->device) { printk("RPC: %s: can't reconnect on " "different device!\n", __func__); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -834,7 +825,7 @@ retry: if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - rdma_destroy_id(id); + rpcrdma_destroy_id(id); rc = -ENETUNREACH; goto out; } @@ -845,7 +836,7 @@ retry: write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); - rdma_destroy_id(old); + rpcrdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -1229,75 +1220,6 @@ rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) (unsigned long long)seg->mr_dma, seg->mr_dmalen); } -static int -rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, - struct ib_mr **mrp, struct ib_sge *iov) -{ - struct ib_phys_buf ipb; - struct ib_mr *mr; - int rc; - - /* - * All memory passed here was kmalloc'ed, therefore phys-contiguous. - */ - iov->addr = ib_dma_map_single(ia->ri_device, - va, len, DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(ia->ri_device, iov->addr)) - return -ENOMEM; - - iov->length = len; - - if (ia->ri_have_dma_lkey) { - *mrp = NULL; - iov->lkey = ia->ri_dma_lkey; - return 0; - } else if (ia->ri_bind_mem != NULL) { - *mrp = NULL; - iov->lkey = ia->ri_bind_mem->lkey; - return 0; - } - - ipb.addr = iov->addr; - ipb.size = iov->length; - mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, - IB_ACCESS_LOCAL_WRITE, &iov->addr); - - dprintk("RPC: %s: phys convert: 0x%llx " - "registered 0x%llx length %d\n", - __func__, (unsigned long long)ipb.addr, - (unsigned long long)iov->addr, len); - - if (IS_ERR(mr)) { - *mrp = NULL; - rc = PTR_ERR(mr); - dprintk("RPC: %s: failed with %i\n", __func__, rc); - } else { - *mrp = mr; - iov->lkey = mr->lkey; - rc = 0; - } - - return rc; -} - -static int -rpcrdma_deregister_internal(struct rpcrdma_ia *ia, - struct ib_mr *mr, struct ib_sge *iov) -{ - int rc; - - ib_dma_unmap_single(ia->ri_device, - iov->addr, iov->length, DMA_BIDIRECTIONAL); - - if (NULL == mr) - return 0; - - rc = ib_dereg_mr(mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); - return rc; -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1317,26 +1239,29 @@ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags) { struct rpcrdma_regbuf *rb; - int rc; + struct ib_sge *iov; - rc = -ENOMEM; rb = kmalloc(sizeof(*rb) + size, flags); if (rb == NULL) goto out; - rb->rg_size = size; - rb->rg_owner = NULL; - rc = rpcrdma_register_internal(ia, rb->rg_base, size, - &rb->rg_mr, &rb->rg_iov); - if (rc) + iov = &rb->rg_iov; + iov->addr = ib_dma_map_single(ia->ri_device, + (void *)rb->rg_base, size, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_device, iov->addr)) goto out_free; + iov->length = size; + iov->lkey = ia->ri_pd->local_dma_lkey; + rb->rg_size = size; + rb->rg_owner = NULL; return rb; out_free: kfree(rb); out: - return ERR_PTR(rc); + return ERR_PTR(-ENOMEM); } /** @@ -1347,10 +1272,15 @@ out: void rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { - if (rb) { - rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov); - kfree(rb); - } + struct ib_sge *iov; + + if (!rb) + return; + + iov = &rb->rg_iov; + ib_dma_unmap_single(ia->ri_device, + iov->addr, iov->length, DMA_BIDIRECTIONAL); + kfree(rb); } /* @@ -1363,9 +1293,11 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_req *req) { + struct ib_device *device = ia->ri_device; struct ib_send_wr send_wr, *send_wr_fail; struct rpcrdma_rep *rep = req->rl_reply; - int rc; + struct ib_sge *iov = req->rl_send_iov; + int i, rc; if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); @@ -1376,22 +1308,15 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, send_wr.next = NULL; send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; - send_wr.sg_list = req->rl_send_iov; + send_wr.sg_list = iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; - if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[3].addr, - req->rl_send_iov[3].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[1].addr, - req->rl_send_iov[1].length, - DMA_TO_DEVICE); - ib_dma_sync_single_for_device(ia->ri_device, - req->rl_send_iov[0].addr, - req->rl_send_iov[0].length, - DMA_TO_DEVICE); + + for (i = 0; i < send_wr.num_sge; i++) + ib_dma_sync_single_for_device(device, iov[i].addr, + iov[i].length, DMA_TO_DEVICE); + dprintk("RPC: %s: posting %d s/g entries\n", + __func__, send_wr.num_sge); if (DECR_CQCOUNT(ep) > 0) send_wr.send_flags = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f49dd8b38122..c09414e6f91b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -51,7 +51,6 @@ #include <linux/sunrpc/clnt.h> /* rpc_xprt */ #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ -#include <linux/sunrpc/svc.h> /* RPCSVC_MAXPAYLOAD */ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ @@ -65,9 +64,7 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_bind_mem; - u32 ri_dma_lkey; - int ri_have_dma_lkey; + struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -89,7 +86,6 @@ struct rpcrdma_ep { int rep_connected; struct ib_qp_init_attr rep_attr; wait_queue_head_t rep_connect_wait; - struct rpcrdma_regbuf *rep_padbuf; struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; @@ -119,7 +115,6 @@ struct rpcrdma_ep { struct rpcrdma_regbuf { size_t rg_size; struct rpcrdma_req *rg_owner; - struct ib_mr *rg_mr; struct ib_sge rg_iov; __be32 rg_base[0] __attribute__ ((aligned(256))); }; @@ -165,8 +160,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * struct rpcrdma_buffer. N is the max number of outstanding requests. */ -/* temporary static scatter/gather max */ -#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */ +#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ struct rpcrdma_buffer; @@ -258,16 +252,18 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; +#define RPCRDMA_MAX_IOVS (2) + struct rpcrdma_req { - unsigned int rl_niovs; /* 0, 2 or 4 */ - unsigned int rl_nchunks; /* non-zero if chunks */ - unsigned int rl_connect_cookie; /* retry detection */ - struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ + unsigned int rl_niovs; + unsigned int rl_nchunks; + unsigned int rl_connect_cookie; + struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ - struct ib_sge rl_send_iov[4]; /* for active requests */ - struct rpcrdma_regbuf *rl_rdmabuf; - struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; + struct rpcrdma_regbuf *rl_rdmabuf; + struct rpcrdma_regbuf *rl_sendbuf; + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -342,6 +338,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; + unsigned long nomsg_call_count; }; /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0030376327b7..1a85e0ed0b48 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -777,7 +777,6 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt) xs_sock_reset_connection_flags(xprt); /* Mark transport as closed and wake up all pending tasks */ xprt_disconnect_done(xprt); - xprt_force_disconnect(xprt); } /** @@ -822,6 +821,8 @@ static void xs_reset_transport(struct sock_xprt *transport) if (atomic_read(&transport->xprt.swapper)) sk_clear_memalloc(sk); + kernel_sock_shutdown(sock, SHUT_RDWR); + write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -829,6 +830,7 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); + xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); @@ -878,8 +880,11 @@ static void xs_xprt_free(struct rpc_xprt *xprt) */ static void xs_destroy(struct rpc_xprt *xprt) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); dprintk("RPC: xs_destroy xprt %p\n", xprt); + cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); xs_xprt_free(xprt); module_put(THIS_MODULE); @@ -1432,6 +1437,7 @@ out: static void xs_tcp_state_change(struct sock *sk) { struct rpc_xprt *xprt; + struct sock_xprt *transport; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) @@ -1443,13 +1449,12 @@ static void xs_tcp_state_change(struct sock *sk) sock_flag(sk, SOCK_ZAPPED), sk->sk_shutdown); + transport = container_of(xprt, struct sock_xprt, xprt); trace_rpc_socket_state_change(xprt, sk->sk_socket); switch (sk->sk_state) { case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - struct sock_xprt *transport = container_of(xprt, - struct sock_xprt, xprt); /* Reset TCP record info */ transport->tcp_offset = 0; @@ -1458,6 +1463,8 @@ static void xs_tcp_state_change(struct sock *sk) transport->tcp_flags = TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; + clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); + xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); } @@ -1493,6 +1500,9 @@ static void xs_tcp_state_change(struct sock *sk) smp_mb__after_atomic(); break; case TCP_CLOSE: + if (test_and_clear_bit(XPRT_SOCK_CONNECTING, + &transport->sock_state)) + xprt_clear_connecting(xprt); xs_sock_mark_closed(xprt); } out: @@ -1866,7 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_clear_connected(xprt); @@ -2051,7 +2061,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; xprt_set_connected(xprt); @@ -2153,7 +2163,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sk->sk_error_report = xs_error_report; - sk->sk_allocation = GFP_ATOMIC; + sk->sk_allocation = GFP_NOIO; /* socket options */ sock_reset_flag(sk, SOCK_LINGER); @@ -2176,6 +2186,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... */ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; + set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { case 0: @@ -2237,7 +2248,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_unlock_connect(xprt, transport); - xprt_clear_connecting(xprt); return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2279,13 +2289,14 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - /* Start by resetting any existing state */ - xs_reset_transport(transport); - - if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { + if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); + + /* Start by resetting any existing state */ + xs_reset_transport(transport); + queue_delayed_work(rpciod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 16c1c43980a1..77f5d17e2612 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -16,6 +16,7 @@ #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/if_bridge.h> +#include <linux/if_vlan.h> #include <net/ip_fib.h> #include <net/switchdev.h> @@ -634,6 +635,8 @@ static int switchdev_port_br_afspec(struct net_device *dev, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); + if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) + return -EINVAL; vlan->flags = vinfo->flags; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { if (vlan->vid_begin) @@ -853,12 +856,8 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; - int err; - - err = switchdev_port_obj_dump(dev, &dump.obj); - if (err) - return err; + switchdev_port_obj_dump(dev, &dump.obj); return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 8b010c976b2f..eadba62afa85 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -42,7 +42,8 @@ #include "core.h" #define MAX_PKT_DEFAULT_MCAST 1500 /* bcast link max packet size (fixed) */ -#define BCLINK_WIN_DEFAULT 20 /* bcast link window size (default) */ +#define BCLINK_WIN_DEFAULT 50 /* bcast link window size (default) */ +#define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; @@ -170,6 +171,30 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) } /** + * bclink_prepare_wakeup - prepare users for wakeup after congestion + * @bcl: broadcast link + * @resultq: queue for users which can be woken up + * Move a number of waiting users, as permitted by available space in + * the send queue, from link wait queue to specified queue for wakeup + */ +static void bclink_prepare_wakeup(struct tipc_link *bcl, struct sk_buff_head *resultq) +{ + int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; + int imp, lim; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&bcl->wakeupq, skb, tmp) { + imp = TIPC_SKB_CB(skb)->chain_imp; + lim = bcl->window + bcl->backlog[imp].limit; + pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; + if ((pnd[imp] + bcl->backlog[imp].len) >= lim) + continue; + skb_unlink(skb, &bcl->wakeupq); + skb_queue_tail(resultq, skb); + } +} + +/** * tipc_bclink_wakeup_users - wake up pending users * * Called with no locks taken @@ -177,8 +202,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to) void tipc_bclink_wakeup_users(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *bcl = tn->bcl; + struct sk_buff_head resultq; - tipc_sk_rcv(net, &tn->bclink->link.wakeupq); + skb_queue_head_init(&resultq); + bclink_prepare_wakeup(bcl, &resultq); + tipc_sk_rcv(net, &resultq); } /** @@ -880,9 +909,10 @@ int tipc_bclink_set_queue_limits(struct net *net, u32 limit) if (!bcl) return -ENOPROTOOPT; - if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) + if (limit < BCLINK_WIN_MIN) + limit = BCLINK_WIN_MIN; + if (limit > TIPC_MAX_LINK_WIN) return -EINVAL; - tipc_bclink_lock(net); tipc_link_set_queue_limits(bcl, limit); tipc_bclink_unlock(net); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 562c926a51cc..5f73450159df 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -121,7 +121,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) { struct sk_buff *head = *headbuf; struct sk_buff *frag = *buf; - struct sk_buff *tail; + struct sk_buff *tail = NULL; struct tipc_msg *msg; u32 fragid; int delta; @@ -141,9 +141,15 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (unlikely(skb_unclone(frag, GFP_ATOMIC))) goto err; head = *headbuf = frag; - skb_frag_list_init(head); - TIPC_SKB_CB(head)->tail = NULL; *buf = NULL; + TIPC_SKB_CB(head)->tail = NULL; + if (skb_is_nonlinear(head)) { + skb_walk_frags(head, tail) { + TIPC_SKB_CB(head)->tail = tail; + } + } else { + skb_frag_list_init(head); + } return 0; } @@ -539,6 +545,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) *err = -TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; + msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; dnode = addr_domain(net, msg_lookup_scope(msg)); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a82c5848d4bc..5351a3f97e8e 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -357,7 +357,7 @@ static inline u32 msg_importance(struct tipc_msg *m) if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) - return msg_bits(m, 5, 13, 0x7); + return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } @@ -366,7 +366,7 @@ static inline void msg_set_importance(struct tipc_msg *m, u32 i) int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) - msg_set_bits(m, 5, 13, 0x7, i); + msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else diff --git a/net/tipc/node.c b/net/tipc/node.c index 703875fd6cde..2c32a83037a3 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1116,7 +1116,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } /* Ignore duplicate packets */ - if (less(oseqno, rcv_nxt)) + if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ @@ -1146,8 +1146,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, if (!pl || !tipc_link_is_up(pl)) return true; - /* Initiate or update synch mode if applicable */ - if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) { + /* Initiate synch mode if applicable */ + if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { syncpt = iseqno + exp_pkts - 1; if (!tipc_link_is_up(l)) { tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c170d3138953..6e648d90297a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -52,6 +52,8 @@ /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 +#define UDP_MIN_HEADROOM 28 + static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, @@ -156,6 +158,9 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, struct sk_buff *clone; struct rtable *rt; + if (skb_headroom(skb) < UDP_MIN_HEADROOM) + pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + clone = skb_clone(skb, GFP_ATOMIC); skb_set_inner_protocol(clone, htons(ETH_P_TIPC)); ub = rcu_dereference_rtnl(b->media_ptr); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 03ee4d359f6a..94f658235fb4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2064,6 +2064,11 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state) goto out; } + if (flags & MSG_PEEK) + skip = sk_peek_offset(sk, flags); + else + skip = 0; + do { int chunk; struct sk_buff *skb, *last; @@ -2112,7 +2117,6 @@ unlock: break; } - skip = sk_peek_offset(sk, flags); while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; @@ -2181,6 +2185,17 @@ unlock: sk_peek_offset_fwd(sk, chunk); + if (UNIXCB(skb).fp) + break; + + skip = 0; + last = skb; + last_len = skb->len; + unix_state_lock(sk); + skb = skb_peek_next(skb, &sk->sk_receive_queue); + if (skb) + goto again; + unix_state_unlock(sk); break; } } while (size); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index df5fc6b340f1..00e8a349aabc 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1948,13 +1948,13 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); - return -ENOENT; + goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); - goto err_misc_deregister; + goto err_deregister_misc; } err = sock_register(&vsock_family_ops); @@ -1969,8 +1969,9 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner) err_unregister_proto: proto_unregister(&vsock_proto); -err_misc_deregister: +err_deregister_misc: misc_deregister(&vsock_device); +err_reset_transport: transport = NULL; err_busy: mutex_unlock(&vsock_register_mutex); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 1f63daff3965..5243ce2b2c18 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -40,13 +40,11 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg); static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg); -static void vmci_transport_peer_attach_cb(u32 sub_id, - const struct vmci_event_data *ed, - void *client_data); static void vmci_transport_peer_detach_cb(u32 sub_id, const struct vmci_event_data *ed, void *client_data); static void vmci_transport_recv_pkt_work(struct work_struct *work); +static void vmci_transport_cleanup(struct work_struct *work); static int vmci_transport_recv_listen(struct sock *sk, struct vmci_transport_packet *pkt); static int vmci_transport_recv_connecting_server( @@ -75,6 +73,10 @@ struct vmci_transport_recv_pkt_info { struct vmci_transport_packet pkt; }; +static LIST_HEAD(vmci_transport_cleanup_list); +static DEFINE_SPINLOCK(vmci_transport_cleanup_lock); +static DECLARE_WORK(vmci_transport_cleanup_work, vmci_transport_cleanup); + static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID, VMCI_INVALID_ID }; static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; @@ -791,44 +793,6 @@ out: return err; } -static void vmci_transport_peer_attach_cb(u32 sub_id, - const struct vmci_event_data *e_data, - void *client_data) -{ - struct sock *sk = client_data; - const struct vmci_event_payload_qp *e_payload; - struct vsock_sock *vsk; - - e_payload = vmci_event_data_const_payload(e_data); - - vsk = vsock_sk(sk); - - /* We don't ask for delayed CBs when we subscribe to this event (we - * pass 0 as flags to vmci_event_subscribe()). VMCI makes no - * guarantees in that case about what context we might be running in, - * so it could be BH or process, blockable or non-blockable. So we - * need to account for all possible contexts here. - */ - local_bh_disable(); - bh_lock_sock(sk); - - /* XXX This is lame, we should provide a way to lookup sockets by - * qp_handle. - */ - if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, - e_payload->handle)) { - /* XXX This doesn't do anything, but in the future we may want - * to set a flag here to verify the attach really did occur and - * we weren't just sent a datagram claiming it was. - */ - goto out; - } - -out: - bh_unlock_sock(sk); - local_bh_enable(); -} - static void vmci_transport_handle_detach(struct sock *sk) { struct vsock_sock *vsk; @@ -871,28 +835,38 @@ static void vmci_transport_peer_detach_cb(u32 sub_id, const struct vmci_event_data *e_data, void *client_data) { - struct sock *sk = client_data; + struct vmci_transport *trans = client_data; const struct vmci_event_payload_qp *e_payload; - struct vsock_sock *vsk; e_payload = vmci_event_data_const_payload(e_data); - vsk = vsock_sk(sk); - if (vmci_handle_is_invalid(e_payload->handle)) - return; - - /* Same rules for locking as for peer_attach_cb(). */ - local_bh_disable(); - bh_lock_sock(sk); /* XXX This is lame, we should provide a way to lookup sockets by * qp_handle. */ - if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, - e_payload->handle)) - vmci_transport_handle_detach(sk); + if (vmci_handle_is_invalid(e_payload->handle) || + vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) + return; - bh_unlock_sock(sk); - local_bh_enable(); + /* We don't ask for delayed CBs when we subscribe to this event (we + * pass 0 as flags to vmci_event_subscribe()). VMCI makes no + * guarantees in that case about what context we might be running in, + * so it could be BH or process, blockable or non-blockable. So we + * need to account for all possible contexts here. + */ + spin_lock_bh(&trans->lock); + if (!trans->sk) + goto out; + + /* Apart from here, trans->lock is only grabbed as part of sk destruct, + * where trans->sk isn't locked. + */ + bh_lock_sock(trans->sk); + + vmci_transport_handle_detach(trans->sk); + + bh_unlock_sock(trans->sk); + out: + spin_unlock_bh(&trans->lock); } static void vmci_transport_qp_resumed_cb(u32 sub_id, @@ -1181,7 +1155,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, */ err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, vmci_transport_peer_detach_cb, - pending, &detach_sub_id); + vmci_trans(vpending), &detach_sub_id); if (err < VMCI_SUCCESS) { vmci_transport_send_reset(pending, pkt); err = vmci_transport_error_to_vsock_error(err); @@ -1321,7 +1295,6 @@ vmci_transport_recv_connecting_client(struct sock *sk, || vmci_trans(vsk)->qpair || vmci_trans(vsk)->produce_size != 0 || vmci_trans(vsk)->consume_size != 0 - || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { skerr = EPROTO; err = -EINVAL; @@ -1389,7 +1362,6 @@ static int vmci_transport_recv_connecting_client_negotiate( struct vsock_sock *vsk; struct vmci_handle handle; struct vmci_qp *qpair; - u32 attach_sub_id; u32 detach_sub_id; bool is_local; u32 flags; @@ -1399,7 +1371,6 @@ static int vmci_transport_recv_connecting_client_negotiate( vsk = vsock_sk(sk); handle = VMCI_INVALID_HANDLE; - attach_sub_id = VMCI_INVALID_ID; detach_sub_id = VMCI_INVALID_ID; /* If we have gotten here then we should be past the point where old @@ -1444,23 +1415,15 @@ static int vmci_transport_recv_connecting_client_negotiate( goto destroy; } - /* Subscribe to attach and detach events first. + /* Subscribe to detach events first. * * XXX We attach once for each queue pair created for now so it is easy * to find the socket (it's provided), but later we should only * subscribe once and add a way to lookup sockets by queue pair handle. */ - err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH, - vmci_transport_peer_attach_cb, - sk, &attach_sub_id); - if (err < VMCI_SUCCESS) { - err = vmci_transport_error_to_vsock_error(err); - goto destroy; - } - err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH, vmci_transport_peer_detach_cb, - sk, &detach_sub_id); + vmci_trans(vsk), &detach_sub_id); if (err < VMCI_SUCCESS) { err = vmci_transport_error_to_vsock_error(err); goto destroy; @@ -1496,7 +1459,6 @@ static int vmci_transport_recv_connecting_client_negotiate( vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = pkt->u.size; - vmci_trans(vsk)->attach_sub_id = attach_sub_id; vmci_trans(vsk)->detach_sub_id = detach_sub_id; vmci_trans(vsk)->notify_ops->process_negotiate(sk); @@ -1504,9 +1466,6 @@ static int vmci_transport_recv_connecting_client_negotiate( return 0; destroy: - if (attach_sub_id != VMCI_INVALID_ID) - vmci_event_unsubscribe(attach_sub_id); - if (detach_sub_id != VMCI_INVALID_ID) vmci_event_unsubscribe(detach_sub_id); @@ -1607,9 +1566,11 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; vmci_trans(vsk)->qpair = NULL; vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0; - vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id = - VMCI_INVALID_ID; + vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; vmci_trans(vsk)->notify_ops = NULL; + INIT_LIST_HEAD(&vmci_trans(vsk)->elem); + vmci_trans(vsk)->sk = &vsk->sk; + vmci_trans(vsk)->lock = __SPIN_LOCK_UNLOCKED(vmci_trans(vsk)->lock); if (psk) { vmci_trans(vsk)->queue_pair_size = vmci_trans(psk)->queue_pair_size; @@ -1629,29 +1590,57 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk, return 0; } -static void vmci_transport_destruct(struct vsock_sock *vsk) +static void vmci_transport_free_resources(struct list_head *transport_list) { - if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) { - vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id); - vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID; - } + while (!list_empty(transport_list)) { + struct vmci_transport *transport = + list_first_entry(transport_list, struct vmci_transport, + elem); + list_del(&transport->elem); - if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { - vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id); - vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; - } + if (transport->detach_sub_id != VMCI_INVALID_ID) { + vmci_event_unsubscribe(transport->detach_sub_id); + transport->detach_sub_id = VMCI_INVALID_ID; + } - if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { - vmci_qpair_detach(&vmci_trans(vsk)->qpair); - vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; - vmci_trans(vsk)->produce_size = 0; - vmci_trans(vsk)->consume_size = 0; + if (!vmci_handle_is_invalid(transport->qp_handle)) { + vmci_qpair_detach(&transport->qpair); + transport->qp_handle = VMCI_INVALID_HANDLE; + transport->produce_size = 0; + transport->consume_size = 0; + } + + kfree(transport); } +} + +static void vmci_transport_cleanup(struct work_struct *work) +{ + LIST_HEAD(pending); + + spin_lock_bh(&vmci_transport_cleanup_lock); + list_replace_init(&vmci_transport_cleanup_list, &pending); + spin_unlock_bh(&vmci_transport_cleanup_lock); + vmci_transport_free_resources(&pending); +} + +static void vmci_transport_destruct(struct vsock_sock *vsk) +{ + /* Ensure that the detach callback doesn't use the sk/vsk + * we are about to destruct. + */ + spin_lock_bh(&vmci_trans(vsk)->lock); + vmci_trans(vsk)->sk = NULL; + spin_unlock_bh(&vmci_trans(vsk)->lock); if (vmci_trans(vsk)->notify_ops) vmci_trans(vsk)->notify_ops->socket_destruct(vsk); - kfree(vsk->trans); + spin_lock_bh(&vmci_transport_cleanup_lock); + list_add(&vmci_trans(vsk)->elem, &vmci_transport_cleanup_list); + spin_unlock_bh(&vmci_transport_cleanup_lock); + schedule_work(&vmci_transport_cleanup_work); + vsk->trans = NULL; } @@ -2146,6 +2135,9 @@ module_init(vmci_transport_init); static void __exit vmci_transport_exit(void) { + cancel_work_sync(&vmci_transport_cleanup_work); + vmci_transport_free_resources(&vmci_transport_cleanup_list); + if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) { if (vmci_datagram_destroy_handle( vmci_transport_stream_handle) != VMCI_SUCCESS) @@ -2164,6 +2156,7 @@ module_exit(vmci_transport_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); +MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("vmware_vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index ce6c9623d5f0..2ad46f39649f 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -119,10 +119,12 @@ struct vmci_transport { u64 queue_pair_size; u64 queue_pair_min_size; u64 queue_pair_max_size; - u32 attach_sub_id; u32 detach_sub_id; union vmci_transport_notify notify; struct vmci_transport_notify_ops *notify_ops; + struct list_head elem; + struct sock *sk; + spinlock_t lock; /* protects sk. */ }; int vmci_transport_register(void); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b144485946f2..2510b231451e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2625,7 +2625,7 @@ static void restore_regulatory_settings(bool reset_user) * settings, user regulatory settings takes precedence. */ if (is_an_alpha2(alpha2)) - regulatory_hint_user(user_alpha2, NL80211_USER_REG_HINT_USER); + regulatory_hint_user(alpha2, NL80211_USER_REG_HINT_USER); spin_lock(®_requests_lock); list_splice_tail_init(&tmp_reg_req_list, ®_requests_list); |