From c4cbaf7973a794839af080f13748335976cf3f3f Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:42 +0300 Subject: cfg80211: Add support for HE Add support for the HE in cfg80211 and also add userspace API to nl80211 to send rate information out, conforming with P802.11ax_D2.0. Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 106 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5fbfe61f41c6..9ba1f289c439 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -285,6 +285,41 @@ struct ieee80211_sta_vht_cap { struct ieee80211_vht_mcs_info vht_mcs; }; +#define IEEE80211_HE_PPE_THRES_MAX_LEN 25 + +/** + * struct ieee80211_sta_he_cap - STA's HE capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11ax HE capabilities for a STA. + * + * @has_he: true iff HE data is valid. + * @he_cap_elem: Fixed portion of the HE capabilities element. + * @he_mcs_nss_supp: The supported NSS/MCS combinations. + * @ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_he_cap { + bool has_he; + struct ieee80211_he_cap_elem he_cap_elem; + struct ieee80211_he_mcs_nss_supp he_mcs_nss_supp; + u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; +}; + +/** + * struct ieee80211_sband_iftype_data + * + * This structure encapsulates sband data that is relevant for the + * interface types defined in @types_mask. Each type in the + * @types_mask must be unique across all instances of iftype_data. + * + * @types_mask: interface types mask + * @he_cap: holds the HE capabilities + */ +struct ieee80211_sband_iftype_data { + u16 types_mask; + struct ieee80211_sta_he_cap he_cap; +}; + /** * struct ieee80211_supported_band - frequency band definition * @@ -301,6 +336,11 @@ struct ieee80211_sta_vht_cap { * @n_bitrates: Number of bitrates in @bitrates * @ht_cap: HT capabilities in this band * @vht_cap: VHT capabilities in this band + * @n_iftype_data: number of iftype data entries + * @iftype_data: interface type data entries. Note that the bits in + * @types_mask inside this structure cannot overlap (i.e. only + * one occurrence of each type is allowed across all instances of + * iftype_data). */ struct ieee80211_supported_band { struct ieee80211_channel *channels; @@ -310,8 +350,55 @@ struct ieee80211_supported_band { int n_bitrates; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; + u16 n_iftype_data; + const struct ieee80211_sband_iftype_data *iftype_data; }; +/** + * ieee80211_get_sband_iftype_data - return sband data for a given iftype + * @sband: the sband to search for the STA on + * @iftype: enum nl80211_iftype + * + * Return: pointer to struct ieee80211_sband_iftype_data, or NULL is none found + */ +static inline const struct ieee80211_sband_iftype_data * +ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, + u8 iftype) +{ + int i; + + if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + return NULL; + + for (i = 0; i < sband->n_iftype_data; i++) { + const struct ieee80211_sband_iftype_data *data = + &sband->iftype_data[i]; + + if (data->types_mask & BIT(iftype)) + return data; + } + + return NULL; +} + +/** + * ieee80211_get_he_sta_cap - return HE capabilities for an sband's STA + * @sband: the sband to search for the STA on + * + * Return: pointer to the struct ieee80211_sta_he_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_he_cap * +ieee80211_get_he_sta_cap(const struct ieee80211_supported_band *sband) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_STATION); + + if (data && data->he_cap.has_he) + return &data->he_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * @@ -899,6 +986,8 @@ enum station_parameters_apply_mask { * @opmode_notif: operating mode field from Operating Mode Notification * @opmode_notif_used: information if operating mode field is used * @support_p2p_ps: information if station supports P2P PS mechanism + * @he_capa: HE capabilities of station + * @he_capa_len: the length of the HE capabilities */ struct station_parameters { const u8 *supported_rates; @@ -926,6 +1015,8 @@ struct station_parameters { u8 opmode_notif; bool opmode_notif_used; int support_p2p_ps; + const struct ieee80211_he_cap_elem *he_capa; + u8 he_capa_len; }; /** @@ -1000,12 +1091,14 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_VHT_MCS: mcs field filled with VHT MCS * @RATE_INFO_FLAGS_SHORT_GI: 400ns guard interval * @RATE_INFO_FLAGS_60G: 60GHz MCS + * @RATE_INFO_FLAGS_HE_MCS: HE MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), RATE_INFO_FLAGS_VHT_MCS = BIT(1), RATE_INFO_FLAGS_SHORT_GI = BIT(2), RATE_INFO_FLAGS_60G = BIT(3), + RATE_INFO_FLAGS_HE_MCS = BIT(4), }; /** @@ -1019,6 +1112,7 @@ enum rate_info_flags { * @RATE_INFO_BW_40: 40 MHz bandwidth * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth + * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1027,6 +1121,7 @@ enum rate_info_bw { RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, + RATE_INFO_BW_HE_RU, }; /** @@ -1035,10 +1130,14 @@ enum rate_info_bw { * Information about a receiving or transmitting bitrate * * @flags: bitflag of flags from &enum rate_info_flags - * @mcs: mcs index if struct describes a 802.11n bitrate + * @mcs: mcs index if struct describes an HT/VHT/HE rate * @legacy: bitrate in 100kbit/s for 802.11abg - * @nss: number of streams (VHT only) + * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) + * @he_gi: HE guard interval (from &enum nl80211_he_gi) + * @he_dcm: HE DCM value + * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, + * only valid if bw is %RATE_INFO_BW_HE_RU) */ struct rate_info { u8 flags; @@ -1046,6 +1145,9 @@ struct rate_info { u16 legacy; u8 nss; u8 bw; + u8 he_gi; + u8 he_dcm; + u8 he_ru_alloc; }; /** -- cgit v1.2.3 From 95a28eeaf1491bcb8bf521bad4784683333705ee Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:43 +0300 Subject: radiotap: add structs for HE Add radiotap structures for HE. Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 123 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'include/net') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 960236fb1681..feef706e1158 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH + * Copyright (c) 2018 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -72,6 +73,8 @@ enum ieee80211_radiotap_presence { IEEE80211_RADIOTAP_AMPDU_STATUS = 20, IEEE80211_RADIOTAP_VHT = 21, IEEE80211_RADIOTAP_TIMESTAMP = 22, + IEEE80211_RADIOTAP_HE = 23, + IEEE80211_RADIOTAP_HE_MU = 24, /* valid in every it_present bitmap, even vendor namespaces */ IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29, @@ -202,6 +205,126 @@ enum ieee80211_radiotap_timestamp_flags { IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY = 0x02, }; +struct ieee80211_radiotap_he { + __le16 data1, data2, data3, data4, data5, data6; +}; + +enum ieee80211_radiotap_he_bits { + IEEE80211_RADIOTAP_HE_DATA1_FORMAT_MASK = 3, + IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU = 0, + IEEE80211_RADIOTAP_HE_DATA1_FORMAT_EXT_SU = 1, + IEEE80211_RADIOTAP_HE_DATA1_FORMAT_MU = 2, + IEEE80211_RADIOTAP_HE_DATA1_FORMAT_TRIG = 3, + + IEEE80211_RADIOTAP_HE_DATA1_BSS_COLOR_KNOWN = 0x0004, + IEEE80211_RADIOTAP_HE_DATA1_BEAM_CHANGE_KNOWN = 0x0008, + IEEE80211_RADIOTAP_HE_DATA1_UL_DL_KNOWN = 0x0010, + IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN = 0x0020, + IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN = 0x0040, + IEEE80211_RADIOTAP_HE_DATA1_CODING_KNOWN = 0x0080, + IEEE80211_RADIOTAP_HE_DATA1_LDPC_XSYMSEG_KNOWN = 0x0100, + IEEE80211_RADIOTAP_HE_DATA1_STBC_KNOWN = 0x0200, + IEEE80211_RADIOTAP_HE_DATA1_SPTL_REUSE_KNOWN = 0x0400, + IEEE80211_RADIOTAP_HE_DATA1_SPTL_REUSE2_KNOWN = 0x0800, + IEEE80211_RADIOTAP_HE_DATA1_SPTL_REUSE3_KNOWN = 0x1000, + IEEE80211_RADIOTAP_HE_DATA1_SPTL_REUSE4_KNOWN = 0x2000, + IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN = 0x4000, + IEEE80211_RADIOTAP_HE_DATA1_DOPPLER_KNOWN = 0x8000, + + IEEE80211_RADIOTAP_HE_DATA2_PRISEC_80_KNOWN = 0x0001, + IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN = 0x0002, + IEEE80211_RADIOTAP_HE_DATA2_NUM_LTF_SYMS_KNOWN = 0x0004, + IEEE80211_RADIOTAP_HE_DATA2_PRE_FEC_PAD_KNOWN = 0x0008, + IEEE80211_RADIOTAP_HE_DATA2_TXBF_KNOWN = 0x0010, + IEEE80211_RADIOTAP_HE_DATA2_PE_DISAMBIG_KNOWN = 0x0020, + IEEE80211_RADIOTAP_HE_DATA2_TXOP_KNOWN = 0x0040, + IEEE80211_RADIOTAP_HE_DATA2_MIDAMBLE_KNOWN = 0x0080, + IEEE80211_RADIOTAP_HE_DATA2_RU_OFFSET = 0x3f00, + IEEE80211_RADIOTAP_HE_DATA2_RU_OFFSET_KNOWN = 0x4000, + IEEE80211_RADIOTAP_HE_DATA2_PRISEC_80_SEC = 0x8000, + + IEEE80211_RADIOTAP_HE_DATA3_BSS_COLOR = 0x003f, + IEEE80211_RADIOTAP_HE_DATA3_BEAM_CHANGE = 0x0040, + IEEE80211_RADIOTAP_HE_DATA3_UL_DL = 0x0080, + IEEE80211_RADIOTAP_HE_DATA3_DATA_MCS = 0x0f00, + IEEE80211_RADIOTAP_HE_DATA3_DATA_DCM = 0x1000, + IEEE80211_RADIOTAP_HE_DATA3_CODING = 0x2000, + IEEE80211_RADIOTAP_HE_DATA3_LDPC_XSYMSEG = 0x4000, + IEEE80211_RADIOTAP_HE_DATA3_STBC = 0x8000, + + IEEE80211_RADIOTAP_HE_DATA4_SU_MU_SPTL_REUSE = 0x000f, + IEEE80211_RADIOTAP_HE_DATA4_MU_STA_ID = 0x7ff0, + IEEE80211_RADIOTAP_HE_DATA4_TB_SPTL_REUSE1 = 0x000f, + IEEE80211_RADIOTAP_HE_DATA4_TB_SPTL_REUSE2 = 0x00f0, + IEEE80211_RADIOTAP_HE_DATA4_TB_SPTL_REUSE3 = 0x0f00, + IEEE80211_RADIOTAP_HE_DATA4_TB_SPTL_REUSE4 = 0xf000, + + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC = 0x000f, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ = 0, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ = 1, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ = 2, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ = 3, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_26T = 4, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_52T = 5, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_106T = 6, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_242T = 7, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_484T = 8, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_996T = 9, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_2x996T = 10, + + IEEE80211_RADIOTAP_HE_DATA5_GI = 0x0030, + IEEE80211_RADIOTAP_HE_DATA5_GI_0_8 = 0, + IEEE80211_RADIOTAP_HE_DATA5_GI_1_6 = 1, + IEEE80211_RADIOTAP_HE_DATA5_GI_3_2 = 2, + + IEEE80211_RADIOTAP_HE_DATA5_LTF_SIZE = 0x00c0, + IEEE80211_RADIOTAP_HE_DATA5_LTF_SIZE_UNKNOWN = 0, + IEEE80211_RADIOTAP_HE_DATA5_LTF_SIZE_1X = 1, + IEEE80211_RADIOTAP_HE_DATA5_LTF_SIZE_2X = 2, + IEEE80211_RADIOTAP_HE_DATA5_LTF_SIZE_4X = 3, + IEEE80211_RADIOTAP_HE_DATA5_NUM_LTF_SYMS = 0x0700, + IEEE80211_RADIOTAP_HE_DATA5_PRE_FEC_PAD = 0x3000, + IEEE80211_RADIOTAP_HE_DATA5_TXBF = 0x4000, + IEEE80211_RADIOTAP_HE_DATA5_PE_DISAMBIG = 0x8000, + + IEEE80211_RADIOTAP_HE_DATA6_NSTS = 0x000f, + IEEE80211_RADIOTAP_HE_DATA6_DOPPLER = 0x0010, + IEEE80211_RADIOTAP_HE_DATA6_TXOP = 0x7f00, + IEEE80211_RADIOTAP_HE_DATA6_MIDAMBLE_PDCTY = 0x8000, +}; + +struct ieee80211_radiotap_he_mu { + __le16 flags1, flags2; + u8 ru_ch1[4]; + u8 ru_ch2[4]; +}; + +enum ieee80211_radiotap_he_mu_bits { + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_MCS = 0x000f, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_MCS_KNOWN = 0x0010, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_DCM = 0x0020, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_DCM_KNOWN = 0x0040, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_CH2_CTR_26T_RU_KNOWN = 0x0080, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_CH1_RU_KNOWN = 0x0100, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_CH2_RU_KNOWN = 0x0200, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_CH1_CTR_26T_RU_KNOWN = 0x1000, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_CH1_CTR_26T_RU = 0x2000, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_COMP_KNOWN = 0x4000, + IEEE80211_RADIOTAP_HE_MU_FLAGS1_SIG_B_SYMS_USERS_KNOWN = 0x8000, + + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW = 0x0003, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW_20MHZ = 0x0000, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW_40MHZ = 0x0001, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW_80MHZ = 0x0002, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW_160MHZ = 0x0003, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_BW_FROM_SIG_A_BW_KNOWN = 0x0004, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_SIG_B_COMP = 0x0008, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_SIG_B_SYMS_USERS = 0x00f0, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_PUNC_FROM_SIG_A_BW = 0x0300, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_PUNC_FROM_SIG_A_BW_KNOWN= 0x0400, + IEEE80211_RADIOTAP_HE_MU_FLAGS2_CH2_CTR_26T_RU = 0x0800, +}; + /** * ieee80211_get_radiotap_len - get radiotap header length */ -- cgit v1.2.3 From 41cbb0f5a29592874355e4159489eb08337cd50e Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Sat, 9 Jun 2018 09:14:44 +0300 Subject: mac80211: add support for HE Add support for HE in mac80211 conforming with P802.11ax_D1.4. Johannes: Fix another bug with the buf_size comparison in agg-rx.c. Signed-off-by: Liad Kaufman Signed-off-by: Johannes Berg Signed-off-by: Ilan Peer Signed-off-by: Ido Yariv Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 64 ++++++++-- net/mac80211/Makefile | 1 + net/mac80211/agg-rx.c | 10 +- net/mac80211/agg-tx.c | 19 ++- net/mac80211/cfg.c | 5 + net/mac80211/he.c | 55 +++++++++ net/mac80211/ieee80211_i.h | 16 +++ net/mac80211/main.c | 19 ++- net/mac80211/mlme.c | 288 ++++++++++++++++++++++++++++++++++++++++++--- net/mac80211/rx.c | 127 +++++++++++++++++++- net/mac80211/sta_info.c | 15 ++- net/mac80211/sta_info.h | 20 +++- net/mac80211/trace.h | 2 +- net/mac80211/util.c | 120 ++++++++++++++++++- 14 files changed, 716 insertions(+), 45 deletions(-) create mode 100644 net/mac80211/he.c (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 851a5e19ae32..5790f55c241d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -23,6 +23,7 @@ #include #include #include +#include #include /** @@ -162,6 +163,8 @@ enum ieee80211_ac_numbers { * @txop: maximum burst time in units of 32 usecs, 0 meaning disabled * @acm: is mandatory admission control required for the access category * @uapsd: is U-APSD mode enabled for the queue + * @mu_edca: is the MU EDCA configured + * @mu_edca_param_rec: MU EDCA Parameter Record for HE */ struct ieee80211_tx_queue_params { u16 txop; @@ -170,6 +173,8 @@ struct ieee80211_tx_queue_params { u8 aifs; bool acm; bool uapsd; + bool mu_edca; + struct ieee80211_he_mu_edca_param_ac_rec mu_edca_param_rec; }; struct ieee80211_low_level_stats { @@ -463,6 +468,15 @@ struct ieee80211_mu_group_data { * This structure keeps information about a BSS (and an association * to that BSS) that can change during the lifetime of the BSS. * + * @bss_color: 6-bit value to mark inter-BSS frame, if BSS supports HE + * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE + * @multi_sta_back_32bit: supports BA bitmap of 32-bits in Multi-STA BACK + * @uora_exists: is the UORA element advertised by AP + * @ack_enabled: indicates support to receive a multi-TID that solicits either + * ACK, BACK or both + * @uora_ocw_range: UORA element's OCW Range field + * @frame_time_rts_th: HE duration RTS threshold, in units of 32us + * @he_support: does this BSS support HE * @assoc: association status * @ibss_joined: indicates whether this station is part of an IBSS * or not @@ -550,6 +564,14 @@ struct ieee80211_mu_group_data { */ struct ieee80211_bss_conf { const u8 *bssid; + u8 bss_color; + u8 htc_trig_based_pkt_ext; + bool multi_sta_back_32bit; + bool uora_exists; + bool ack_enabled; + u8 uora_ocw_range; + u16 frame_time_rts_th; + bool he_support; /* association related data */ bool assoc, ibss_joined; bool ibss_creator; @@ -1106,6 +1128,18 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this * frame * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known + * @RX_FLAG_RADIOTAP_HE: HE radiotap data is present + * (&struct ieee80211_radiotap_he, mac80211 will fill in + * - DATA3_DATA_MCS + * - DATA3_DATA_DCM + * - DATA3_CODING + * - DATA5_GI + * - DATA5_DATA_BW_RU_ALLOC + * - DATA6_NSTS + * - DATA3_STBC + * from the RX info data, so leave those zeroed when building this data) + * @RX_FLAG_RADIOTAP_HE_MU: HE MU radiotap data is present + * (&struct ieee80211_radiotap_he_mu) */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1134,6 +1168,8 @@ enum mac80211_rx_flags { RX_FLAG_ICV_STRIPPED = BIT(23), RX_FLAG_AMPDU_EOF_BIT = BIT(24), RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25), + RX_FLAG_RADIOTAP_HE = BIT(26), + RX_FLAG_RADIOTAP_HE_MU = BIT(27), }; /** @@ -1164,6 +1200,7 @@ enum mac80211_rx_encoding { RX_ENC_LEGACY = 0, RX_ENC_HT, RX_ENC_VHT, + RX_ENC_HE, }; /** @@ -1198,6 +1235,9 @@ enum mac80211_rx_encoding { * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags + * @he_ru: HE RU, from &enum nl80211_he_ru_alloc + * @he_gi: HE GI, from &enum nl80211_he_gi + * @he_dcm: HE DCM value * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU @@ -1211,7 +1251,8 @@ struct ieee80211_rx_status { u32 flag; u16 freq; u8 enc_flags; - u8 encoding:2, bw:3; + u8 encoding:2, bw:3, he_ru:3; + u8 he_gi:2, he_dcm:1; u8 rate_idx; u8 nss; u8 rx_flags; @@ -1770,6 +1811,7 @@ struct ieee80211_sta_rates { * @supp_rates: Bitmap of supported rates (per band) * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities + * @he_cap: HE capabilities of this STA * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -1805,7 +1847,8 @@ struct ieee80211_sta { u16 aid; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; - u8 max_rx_aggregation_subframes; + struct ieee80211_sta_he_cap he_cap; + u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; u8 max_sp; @@ -2196,10 +2239,11 @@ enum ieee80211_hw_flags { * it shouldn't be set. * * @max_tx_aggregation_subframes: maximum number of subframes in an - * aggregate an HT driver will transmit. Though ADDBA will advertise - * a constant value of 64 as some older APs can crash if the window - * size is smaller (an example is LinkSys WRT120N with FW v1.0.07 - * build 002 Jun 18 2012). + * aggregate an HT/HE device will transmit. In HT AddBA we'll + * advertise a constant value of 64 as some older APs crash if + * the window size is smaller (an example is LinkSys WRT120N + * with FW v1.0.07 build 002 Jun 18 2012). + * For AddBA to HE capable peers this value will be used. * * @max_tx_fragments: maximum number of tx buffers per (A)-MSDU, sum * of 1 + skb_shinfo(skb)->nr_frags for each skb in the frag_list. @@ -2216,6 +2260,8 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * + * @radiotap_he: HE radiotap validity flags + * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * 'units_pos' member is set to a non-negative value it must be set to * a combination of a IEEE80211_RADIOTAP_TIMESTAMP_UNIT_* and a @@ -2263,8 +2309,8 @@ struct ieee80211_hw { u8 max_rates; u8 max_report_rates; u8 max_rate_tries; - u8 max_rx_aggregation_subframes; - u8 max_tx_aggregation_subframes; + u16 max_rx_aggregation_subframes; + u16 max_tx_aggregation_subframes; u8 max_tx_fragments; u8 offchannel_tx_hw_queue; u8 radiotap_mcs_details; @@ -2904,7 +2950,7 @@ struct ieee80211_ampdu_params { struct ieee80211_sta *sta; u16 tid; u16 ssn; - u8 buf_size; + u16 buf_size; bool amsdu; u16 timeout; }; diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index e3589ade62e0..bb707789ef2b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -12,6 +12,7 @@ mac80211-y := \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ vht.o \ + he.o \ ibss.o \ iface.o \ rate.o \ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 3ffd853b483f..6a4f154c99f6 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,6 +245,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, }; int i, ret = -EOPNOTSUPP; u16 status = WLAN_STATUS_REQUEST_DECLINED; + u16 max_buf_size; if (tid >= IEEE80211_FIRST_TSPEC_TSID) { ht_dbg(sta->sdata, @@ -268,13 +269,18 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } + if (sta->sta.he_cap.has_he) + max_buf_size = IEEE80211_MAX_AMPDU_BUF; + else + max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + /* sanity check for incoming parameters: * check if configuration can support the BA policy * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || - (buf_size > IEEE80211_MAX_AMPDU_BUF_HT)) { + (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", @@ -283,7 +289,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } /* determine default buffer size */ if (buf_size == 0) - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + buf_size = max_buf_size; /* make sure the size doesn't exceed the maximum supported by the hw */ if (buf_size > sta->sta.max_rx_aggregation_subframes) diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 86c6bc0432ba..69e831bc317b 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -463,6 +463,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; + u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -511,11 +512,22 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. + */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + /* send AddBA request */ ieee80211_send_addba_request(sdata, sta->sta.addr, tid, tid_tx->dialog_token, params.ssn, - IEEE80211_MAX_AMPDU_BUF_HT, - tid_tx->timeout); + buf_size, tid_tx->timeout); } /* @@ -905,8 +917,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, { struct tid_ampdu_tx *tid_tx; struct ieee80211_txq *txq; - u16 capab, tid; - u8 buf_size; + u16 capab, tid, buf_size; bool amsdu; capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c4e2f7d2bcb8..02f3672e7b5e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1412,6 +1412,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, params->vht_capa, sta); + if (params->he_capa) + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + (void *)params->he_capa, + params->he_capa_len, sta); + if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it diff --git a/net/mac80211/he.c b/net/mac80211/he.c new file mode 100644 index 000000000000..769078ed5a12 --- /dev/null +++ b/net/mac80211/he.c @@ -0,0 +1,55 @@ +/* + * HE handling + * + * Copyright(c) 2017 Intel Deutschland GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "ieee80211_i.h" + +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta) +{ + struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; + struct ieee80211_he_cap_elem *he_cap_ie_elem = (void *)he_cap_ie; + u8 he_ppe_size; + u8 mcs_nss_size; + u8 he_total_size; + + memset(he_cap, 0, sizeof(*he_cap)); + + if (!he_cap_ie || !ieee80211_get_he_sta_cap(sband)) + return; + + /* Make sure size is OK */ + mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap_ie_elem); + he_ppe_size = + ieee80211_he_ppe_size(he_cap_ie[sizeof(he_cap->he_cap_elem) + + mcs_nss_size], + he_cap_ie_elem->phy_cap_info); + he_total_size = sizeof(he_cap->he_cap_elem) + mcs_nss_size + + he_ppe_size; + if (he_cap_len < he_total_size) + return; + + memcpy(&he_cap->he_cap_elem, he_cap_ie, sizeof(he_cap->he_cap_elem)); + + /* HE Tx/Rx HE MCS NSS Support Field */ + memcpy(&he_cap->he_mcs_nss_supp, + &he_cap_ie[sizeof(he_cap->he_cap_elem)], mcs_nss_size); + + /* Check if there are (optional) PPE Thresholds */ + if (he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) + memcpy(he_cap->ppe_thres, + &he_cap_ie[sizeof(he_cap->he_cap_elem) + mcs_nss_size], + he_ppe_size); + + he_cap->has_he = true; +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a6c12c104c38..172aeae21ae9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -365,6 +365,7 @@ enum ieee80211_sta_flags { IEEE80211_STA_DISABLE_160MHZ = BIT(13), IEEE80211_STA_DISABLE_WMM = BIT(14), IEEE80211_STA_ENABLE_RRM = BIT(15), + IEEE80211_STA_DISABLE_HE = BIT(16), }; struct ieee80211_mgd_auth_data { @@ -1454,6 +1455,10 @@ struct ieee802_11_elems { const struct ieee80211_vht_cap *vht_cap_elem; const struct ieee80211_vht_operation *vht_operation; const struct ieee80211_meshconf_ie *mesh_config; + const u8 *he_cap; + const struct ieee80211_he_operation *he_operation; + const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const u8 *uora_element; const u8 *mesh_id; const u8 *peering; const __le16 *awake_window; @@ -1483,6 +1488,7 @@ struct ieee802_11_elems { u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; + u8 he_cap_len; u8 mesh_id_len; u8 peering_len; u8 preq_len; @@ -1825,6 +1831,13 @@ void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); +/* HE */ +void +ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const u8 *he_cap_ie, u8 he_cap_len, + struct sta_info *sta); + /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, @@ -2076,6 +2089,9 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u32 cap); u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 070f77862014..b33faba8cbbe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -3,6 +3,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -825,7 +826,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht; + bool supp_ht, supp_vht, supp_he; netdev_features_t feature_whitelist; struct cfg80211_chan_def dflt_chandef = {}; @@ -905,6 +906,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; + supp_he = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; @@ -931,6 +933,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + if (!supp_he) + supp_he = !!ieee80211_get_he_sta_cap(sband); + if (!sband->ht_cap.ht_supported) continue; @@ -1020,6 +1025,18 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + /* HE cap element is variable in size - set len to allow max size */ + /* + * TODO: 1 is added at the end of the calculation to accommodate for + * the temporary placing of the HE capabilities IE under EXT. + * Remove it once it is placed in the final place. + */ + if (supp_he) + local->scan_ies_len += + 2 + sizeof(struct ieee80211_he_cap_elem) + + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + 1; + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. */ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a44e5b4aaeda..0322d78007ad 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -149,6 +149,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, struct cfg80211_chan_def *chandef, bool tracking) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -207,7 +208,27 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, } vht_chandef = *chandef; - if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && he_oper && + (le32_to_cpu(he_oper->he_oper_params) & + IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { + struct ieee80211_vht_operation he_oper_vht_cap; + + /* + * Set only first 3 bytes (other 2 aren't used in + * ieee80211_chandef_vht_oper() anyway) + */ + memcpy(&he_oper_vht_cap, he_oper->optional, 3); + he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); + + if (!ieee80211_chandef_vht_oper(&he_oper_vht_cap, + &vht_chandef)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + sdata_info(sdata, + "HE AP VHT information is invalid, disable HE\n"); + ret = IEEE80211_STA_DISABLE_HE; + goto out; + } + } else if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -300,12 +321,14 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, + const struct ieee80211_he_operation *he_oper, const u8 *bssid, u32 *changed) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_supported_band *sband; - struct ieee80211_channel *chan; + struct ieee80211_channel *chan = sdata->vif.bss_conf.chandef.chan; + struct ieee80211_supported_band *sband = + local->hw.wiphy->bands[chan->band]; struct cfg80211_chan_def chandef; u16 ht_opmode; u32 flags; @@ -320,6 +343,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) vht_oper = NULL; + /* don't check HE if we associated as non-HE station */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE || + !ieee80211_get_he_sta_cap(sband)) + he_oper = NULL; + if (WARN_ON_ONCE(!sta)) return -EINVAL; @@ -333,12 +361,9 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } - chan = sdata->vif.bss_conf.chandef.chan; - sband = local->hw.wiphy->bands[chan->band]; - - /* calculate new channel (type) based on HT/VHT operation IEs */ + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, true); /* @@ -582,6 +607,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); } +/* This function determines HE capability flags for the association + * and builds the IE. + */ +static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + struct ieee80211_supported_band *sband) +{ + u8 *pos; + const struct ieee80211_sta_he_cap *he_cap = NULL; + u8 he_cap_size; + + he_cap = ieee80211_get_he_sta_cap(sband); + if (!he_cap) + return; + + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. + */ + he_cap_size = + 2 + 1 + sizeof(he_cap->he_cap_elem) + + ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem) + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + pos = skb_put(skb, he_cap_size); + ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); +} + static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -643,6 +696,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 2 * sband->n_channels + /* supported channels */ 2 + sizeof(struct ieee80211_ht_cap) + /* HT */ 2 + sizeof(struct ieee80211_vht_cap) + /* VHT */ + 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ @@ -827,11 +883,41 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } + /* if present, add any custom IEs that go before HE */ + if (assoc_data->ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_OPMODE_NOTIF, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, + /* 11ai elements */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + + /* RIC already taken above, so no need to handle here anymore */ + noffset = ieee80211_ie_split(assoc_data->ie, assoc_data->ie_len, + before_he, ARRAY_SIZE(before_he), + offset); + pos = skb_put(skb, noffset - offset); + memcpy(pos, assoc_data->ie + offset, noffset - offset); + offset = noffset; + } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); - /* if present, add any custom non-vendor IEs that go after HT */ + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + ieee80211_add_he_ie(sdata, skb, sband); + + /* if present, add any custom non-vendor IEs that go after HE */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, @@ -898,6 +984,11 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) @@ -929,6 +1020,10 @@ static void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; + /* Don't send NDPs when connected HE */ + if (!(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; @@ -1700,9 +1795,11 @@ static void ieee80211_sta_handle_tspec_ac_params_wk(struct work_struct *work) } /* MLME */ -static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - const u8 *wmm_param, size_t wmm_param_len) +static bool +ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1749,6 +1846,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; @@ -1756,6 +1856,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; @@ -1763,6 +1866,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: @@ -1771,6 +1877,9 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; + params[ac].mu_edca = !!mu_edca; + if (mu_edca) + params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } @@ -3021,6 +3130,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + /* + * If AP doesn't support HT, or it doesn't have HE mandatory IEs, mark + * HE as disabled. If on the 5GHz band, make sure it supports VHT. + */ + if (ifmgd->flags & IEEE80211_STA_DISABLE_HT || + (sband->band == NL80211_BAND_5GHZ && + ifmgd->flags & IEEE80211_STA_DISABLE_VHT) || + (!elems.he_cap && !elems.he_operation)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + (!elems.he_cap || !elems.he_operation)) { + mutex_unlock(&sdata->local->sta_mtx); + sdata_info(sdata, + "HE AP is missing HE capability/operation\n"); + ret = false; + goto out; + } + /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -3030,6 +3158,48 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems.vht_cap_elem, sta); + if (elems.he_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + elems.he_cap) { + ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, + elems.he_cap, + elems.he_cap_len, + sta); + + bss_conf->he_support = sta->sta.he_cap.has_he; + } else { + bss_conf->he_support = false; + } + + if (bss_conf->he_support) { + u32 he_oper_params = + le32_to_cpu(elems.he_operation->he_oper_params); + + bss_conf->bss_color = he_oper_params & + IEEE80211_HE_OPERATION_BSS_COLOR_MASK; + bss_conf->htc_trig_based_pkt_ext = + (he_oper_params & + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK) << + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_OFFSET; + bss_conf->frame_time_rts_th = + (he_oper_params & + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK) << + IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET; + + bss_conf->multi_sta_back_32bit = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP; + + bss_conf->ack_enabled = + sta->sta.he_cap.he_cap_elem.mac_cap_info[2] & + IEEE80211_HE_MAC_CAP2_ACK_EN; + + bss_conf->uora_exists = !!elems.uora_element; + if (elems.uora_element) + bss_conf->uora_ocw_range = elems.uora_element[0]; + + /* TODO: OPEN: what happens if BSS color disable is set? */ + } + /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own @@ -3089,7 +3259,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (ifmgd->flags & IEEE80211_STA_DISABLE_WMM) { ieee80211_set_wmm_default(sdata, false, false); } else if (!ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) { + elems.wmm_param_len, + elems.mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(sdata, false, true); /* set the disable-WMM flag in this case to disable @@ -3603,7 +3774,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len)) + elems.wmm_param_len, + elems.mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* @@ -3642,7 +3814,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, elems.ht_operation, - elems.vht_operation, bssid, &changed)) { + elems.vht_operation, elems.he_operation, + bssid, &changed)) { mutex_unlock(&local->sta_mtx); sdata_info(sdata, "failed to follow AP %pM bandwidth change, disconnect\n", @@ -4279,6 +4452,66 @@ static u8 ieee80211_ht_vht_rx_chains(struct ieee80211_sub_if_data *sdata, return chains; } +static bool +ieee80211_verify_sta_he_mcs_support(struct ieee80211_supported_band *sband, + const struct ieee80211_he_operation *he_op) +{ + const struct ieee80211_sta_he_cap *sta_he_cap = + ieee80211_get_he_sta_cap(sband); + u16 ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); + int i; + + if (!sta_he_cap || !he_op) + return false; + + /* Need to go over for 80MHz, 160MHz and for 80+80 */ + for (i = 0; i < 3; i++) { + const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = + &sta_he_cap->he_mcs_nss_supp; + u16 sta_mcs_map_rx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); + u16 sta_mcs_map_tx = + le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); + u8 nss; + bool verified = true; + + /* + * For each band there is a maximum of 8 spatial streams + * possible. Each of the sta_mcs_map_* is a 16-bit struct built + * of 2 bits per NSS (1-8), with the values defined in enum + * ieee80211_he_mcs_support. Need to make sure STA TX and RX + * capabilities aren't less than the AP's minimum requirements + * for this HE BSS per SS. + * It is enough to find one such band that meets the reqs. + */ + for (nss = 8; nss > 0; nss--) { + u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; + u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; + u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + + if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + /* + * Make sure the HE AP doesn't require MCSs that aren't + * supported by the client + */ + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { + verified = false; + break; + } + } + + if (verified) + return true; + } + + /* If here, STA doesn't meet AP's HE min requirements */ + return false; +} + static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss) { @@ -4287,6 +4520,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ht_cap *ht_cap = NULL; const struct ieee80211_ht_operation *ht_oper = NULL; const struct ieee80211_vht_operation *vht_oper = NULL; + const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; int ret; @@ -4342,6 +4576,25 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + ieee80211_get_he_sta_cap(sband)) { + const struct cfg80211_bss_ies *ies; + const u8 *he_oper_ie; + + ies = rcu_dereference(cbss->ies); + he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, + ies->data, ies->len); + if (he_oper_ie && + he_oper_ie[1] == ieee80211_he_oper_size(&he_oper_ie[3])) + he_oper = (void *)(he_oper_ie + 3); + else + he_oper = NULL; + + if (!he_oper || + !ieee80211_verify_sta_he_mcs_support(sband, he_oper)) + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } + /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { @@ -4358,7 +4611,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_oper, vht_oper, + ht_oper, vht_oper, he_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), @@ -4764,8 +5017,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; netdev_info(sdata->dev, - "disabling HT/VHT due to WEP/TKIP use\n"); + "disabling HE/HT/VHT due to WEP/TKIP use\n"); } } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 756ba176db1e..a16ba568e2a3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -175,6 +175,20 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he) != 12); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + len = ALIGN(len, 2); + len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_he_mu) != 12); + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -263,6 +277,19 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, int mpdulen, chain; unsigned long chains = status->chains; struct ieee80211_vendor_radiotap rtap = {}; + struct ieee80211_radiotap_he he = {}; + struct ieee80211_radiotap_he_mu he_mu = {}; + + if (status->flag & RX_FLAG_RADIOTAP_HE) { + he = *(struct ieee80211_radiotap_he *)skb->data; + skb_pull(skb, sizeof(he)); + WARN_ON_ONCE(status->encoding != RX_ENC_HE); + } + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) { + he_mu = *(struct ieee80211_radiotap_he_mu *)skb->data; + skb_pull(skb, sizeof(he_mu)); + } if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) { rtap = *(struct ieee80211_vendor_radiotap *)skb->data; @@ -520,6 +547,89 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = flags; } + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE) { +#define HE_PREP(f, val) cpu_to_le16(FIELD_PREP(IEEE80211_RADIOTAP_HE_##f, val)) + + if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) { + he.data6 |= HE_PREP(DATA6_NSTS, + FIELD_GET(RX_ENC_FLAG_STBC_MASK, + status->enc_flags)); + he.data3 |= HE_PREP(DATA3_STBC, 1); + } else { + he.data6 |= HE_PREP(DATA6_NSTS, status->nss); + } + +#define CHECK_GI(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ + (int)NL80211_RATE_INFO_HE_GI_##s) + + CHECK_GI(0_8); + CHECK_GI(1_6); + CHECK_GI(3_2); + + he.data3 |= HE_PREP(DATA3_DATA_MCS, status->rate_idx); + he.data3 |= HE_PREP(DATA3_DATA_DCM, status->he_dcm); + he.data3 |= HE_PREP(DATA3_CODING, + !!(status->enc_flags & RX_ENC_FLAG_LDPC)); + + he.data5 |= HE_PREP(DATA5_GI, status->he_gi); + + switch (status->bw) { + case RATE_INFO_BW_20: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); + break; + case RATE_INFO_BW_40: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); + break; + case RATE_INFO_BW_80: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); + break; + case RATE_INFO_BW_160: + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); + break; + case RATE_INFO_BW_HE_RU: +#define CHECK_RU_ALLOC(s) \ + BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ + NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) + + CHECK_RU_ALLOC(26); + CHECK_RU_ALLOC(52); + CHECK_RU_ALLOC(106); + CHECK_RU_ALLOC(242); + CHECK_RU_ALLOC(484); + CHECK_RU_ALLOC(996); + CHECK_RU_ALLOC(2x996); + + he.data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, + status->he_ru + 4); + break; + default: + WARN_ONCE(1, "Invalid SU BW %d\n", status->bw); + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + memcpy(pos, &he, sizeof(he)); + pos += sizeof(he); + } + + if (status->encoding == RX_ENC_HE && + status->flag & RX_FLAG_RADIOTAP_HE_MU) { + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + memcpy(pos, &he_mu, sizeof(he_mu)); + pos += sizeof(he_mu); + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -613,6 +723,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, rcu_dereference(local->monitor_sdata); bool only_monitor = false; + if (status->flag & RX_FLAG_RADIOTAP_HE) + rtap_space += sizeof(struct ieee80211_radiotap_he); + + if (status->flag & RX_FLAG_RADIOTAP_HE_MU) + rtap_space += sizeof(struct ieee80211_radiotap_he_mu); + if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -3386,8 +3502,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, status = IEEE80211_SKB_RXCB((rx->skb)); sband = rx->local->hw.wiphy->bands[status->band]; - if (!(status->encoding == RX_ENC_HT) && - !(status->encoding == RX_ENC_VHT)) + if (status->encoding == RX_ENC_LEGACY) rate = &sband->bitrates[status->rate_idx]; ieee80211_rx_cooked_monitor(rx, rate); @@ -4386,6 +4501,14 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss)) goto drop; break; + case RX_ENC_HE: + if (WARN_ONCE(status->rate_idx > 11 || + !status->nss || + status->nss > 8, + "Rate marked as an HE rate but data is invalid: MCS: %d, NSS: %d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); /* fall through */ diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aa96fddfbfc2..aa8fe771a8db 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1323,6 +1323,11 @@ static void ieee80211_send_null_response(struct sta_info *sta, int tid, struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; + /* Don't send NDPs when STA is connected HE */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->u.mgd.flags & IEEE80211_STA_DISABLE_HE)) + return; + if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | @@ -1968,7 +1973,7 @@ sta_get_last_rx_stats(struct sta_info *sta) return stats; } -static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, +static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); @@ -2005,6 +2010,14 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } + case STA_STATS_RATE_TYPE_HE: + rinfo->flags = RATE_INFO_FLAGS_HE_MCS; + rinfo->mcs = STA_STATS_GET(HE_MCS, rate); + rinfo->nss = STA_STATS_GET(HE_NSS, rate); + rinfo->he_gi = STA_STATS_GET(HE_GI, rate); + rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); + rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); + break; } } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 81b35f623792..9a04327d71d1 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -170,7 +170,7 @@ struct tid_ampdu_tx { u8 dialog_token; u8 stop_initiator; bool tx_stop; - u8 buf_size; + u16 buf_size; u16 failed_bar_ssn; bool bar_pending; @@ -405,7 +405,7 @@ struct ieee80211_sta_rx_stats { int last_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - u16 last_rate; + u32 last_rate; struct u64_stats_sync syncp; u64 bytes; u64 msdu[IEEE80211_NUM_TIDS + 1]; @@ -764,6 +764,7 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_LEGACY, STA_STATS_RATE_TYPE_HT, STA_STATS_RATE_TYPE_VHT, + STA_STATS_RATE_TYPE_HE, }; #define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) @@ -771,9 +772,14 @@ enum sta_stats_type { #define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) #define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) #define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) #define STA_STATS_FIELD_BW GENMASK(11, 8) #define STA_STATS_FIELD_SGI GENMASK(12, 12) #define STA_STATS_FIELD_TYPE GENMASK(15, 13) +#define STA_STATS_FIELD_HE_RU GENMASK(18, 16) +#define STA_STATS_FIELD_HE_GI GENMASK(20, 19) +#define STA_STATS_FIELD_HE_DCM GENMASK(21, 21) #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -782,7 +788,7 @@ enum sta_stats_type { static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { - u16 r; + u32 r; r = STA_STATS_FIELD(BW, s->bw); @@ -804,6 +810,14 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(LEGACY_BAND, s->band); r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); break; + case RX_ENC_HE: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HE); + r |= STA_STATS_FIELD(HE_NSS, s->nss); + r |= STA_STATS_FIELD(HE_MCS, s->rate_idx); + r |= STA_STATS_FIELD(HE_GI, s->he_gi); + r |= STA_STATS_FIELD(HE_RU, s->he_ru); + r |= STA_STATS_FIELD(HE_DCM, s->he_dcm); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 80a7edf8d314..0ab69a1964f8 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -92,7 +92,7 @@ STA_ENTRY \ __field(u16, tid) \ __field(u16, ssn) \ - __field(u8, buf_size) \ + __field(u16, buf_size) \ __field(bool, amsdu) \ __field(u16, timeout) \ __field(u16, action) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b744b10465c3..c77c84325348 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1095,6 +1095,21 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; + case WLAN_EID_EXTENSION: + if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA && + elen >= (sizeof(*elems->mu_edca_param_set) + 1)) { + elems->mu_edca_param_set = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) { + elems->he_cap = (void *)&pos[1]; + elems->he_cap_len = elen - 1; + } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION && + elen >= sizeof(*elems->he_operation) && + elen >= ieee80211_he_oper_size(&pos[1])) { + elems->he_operation = (void *)&pos[1]; + } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) { + elems->uora_element = (void *)&pos[1]; + } + break; default: break; } @@ -1356,6 +1371,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, size_t *offset, u32 flags) { struct ieee80211_supported_band *sband; + const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; size_t noffset; int supp_rates_len, i; @@ -1463,11 +1479,6 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->ht_cap.cap); } - /* - * If adding more here, adjust code in main.c - * that calculates local->scan_ies_len. - */ - /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { @@ -1510,6 +1521,39 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, sband->vht_cap.cap); } + /* insert custom IEs that go before HE */ + if (ie && ie_len) { + static const u8 before_he[] = { + /* + * no need to list the ones split off before VHT + * or generated here + */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_REQ_PARAMS, + WLAN_EID_AP_CSN, + /* TODO: add 11ah/11aj/11ak elements */ + }; + noffset = ieee80211_ie_split(ie, ie_len, + before_he, ARRAY_SIZE(before_he), + *offset); + if (end - pos < noffset - *offset) + goto out_err; + memcpy(pos, ie + *offset, noffset - *offset); + pos += noffset - *offset; + *offset = noffset; + } + + he_cap = ieee80211_get_he_sta_cap(sband); + if (he_cap) { + pos = ieee80211_ie_build_he_cap(pos, he_cap, end); + if (!pos) + goto out_err; + } + + /* + * If adding more here, adjust code in main.c + * that calculates local->scan_ies_len. + */ + return pos - buffer; out_err: WARN_ONCE(1, "not enough space for preq IEs\n"); @@ -2396,6 +2440,72 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos; } +u8 *ieee80211_ie_build_he_cap(u8 *pos, + const struct ieee80211_sta_he_cap *he_cap, + u8 *end) +{ + u8 n; + u8 ie_len; + u8 *orig_pos = pos; + + /* Make sure we have place for the IE */ + /* + * TODO: the 1 added is because this temporarily is under the EXTENSION + * IE. Get rid of it when it moves. + */ + if (!he_cap) + return orig_pos; + + n = ieee80211_he_mcs_nss_size(&he_cap->he_cap_elem); + ie_len = 2 + 1 + + sizeof(he_cap->he_cap_elem) + n + + ieee80211_he_ppe_size(he_cap->ppe_thres[0], + he_cap->he_cap_elem.phy_cap_info); + + if ((end - pos) < ie_len) + return orig_pos; + + *pos++ = WLAN_EID_EXTENSION; + pos++; /* We'll set the size later below */ + *pos++ = WLAN_EID_EXT_HE_CAPABILITY; + + /* Fixed data */ + memcpy(pos, &he_cap->he_cap_elem, sizeof(he_cap->he_cap_elem)); + pos += sizeof(he_cap->he_cap_elem); + + memcpy(pos, &he_cap->he_mcs_nss_supp, n); + pos += n; + + /* Check if PPE Threshold should be present */ + if ((he_cap->he_cap_elem.phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + goto end; + + /* + * Calculate how many PPET16/PPET8 pairs are to come. Algorithm: + * (NSS_M1 + 1) x (num of 1 bits in RU_INDEX_BITMASK) + */ + n = hweight8(he_cap->ppe_thres[0] & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((he_cap->ppe_thres[0] & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + /* Copy PPE Thresholds */ + memcpy(pos, &he_cap->ppe_thres, n); + pos += n; + +end: + orig_pos[1] = (pos - orig_pos) - 2; + return pos; +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) -- cgit v1.2.3 From 0eb71a9da5796851fa87ddc1a534066c0fe54055 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 18 Jun 2018 12:52:50 +1000 Subject: rhashtable: split rhashtable.h Due to the use of rhashtables in net namespaces, rhashtable.h is included in lots of the kernel, so a small changes can required a large recompilation. This makes development painful. This patch splits out rhashtable-types.h which just includes the major type declarations, and does not include (non-trivial) inline code. rhashtable.h is no longer included by anything in the include/ directory. Common include files only include rhashtable-types.h so a large recompilation is only triggered when that changes. Acked-by: Herbert Xu Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- MAINTAINERS | 2 + drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 + include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/mroute_base.h | 2 +- include/linux/rhashtable-types.h | 139 +++++++++++++++++++++++++++++ include/linux/rhashtable.h | 127 +------------------------- include/net/inet_frag.h | 2 +- include/net/netfilter/nf_flow_table.h | 2 +- include/net/sctp/structs.h | 2 +- include/net/seg6.h | 2 +- include/net/seg6_hmac.h | 2 +- ipc/msg.c | 1 + ipc/sem.c | 1 + ipc/shm.c | 1 + ipc/util.c | 1 + lib/rhashtable.c | 1 + net/ipv4/inet_fragment.c | 1 + net/ipv4/ipmr.c | 1 + net/ipv4/ipmr_base.c | 1 + net/ipv6/ip6mr.c | 1 + net/ipv6/seg6.c | 1 + net/ipv6/seg6_hmac.c | 1 + net/netfilter/nf_tables_api.c | 1 + net/sctp/input.c | 1 + net/sctp/socket.c | 1 + 26 files changed, 166 insertions(+), 133 deletions(-) create mode 100644 include/linux/rhashtable-types.h (limited to 'include/net') diff --git a/MAINTAINERS b/MAINTAINERS index edf3cf5ea691..99e5cef8172e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12162,7 +12162,9 @@ M: Herbert Xu L: netdev@vger.kernel.org S: Maintained F: lib/rhashtable.c +F: lib/test_rhashtable.c F: include/linux/rhashtable.h +F: include/linux/rhashtable-types.h RICOH R5C592 MEMORYSTICK DRIVER M: Maxim Levitsky diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 0dbe2d9e22d6..1adb968b8354 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 6cc2df7f7ac9..e1c9eea6015b 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index b5630c8eb2f3..6cea726612b7 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct user_namespace; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index d633f737b3c6..fd436cdd4725 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,7 +2,7 @@ #define __LINUX_MROUTE_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h new file mode 100644 index 000000000000..9740063ff13b --- /dev/null +++ b/include/linux/rhashtable-types.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Resizable, Scalable, Concurrent Hash Table + * + * Simple structures that might be needed in include + * files. + */ + +#ifndef _LINUX_RHASHTABLE_TYPES_H +#define _LINUX_RHASHTABLE_TYPES_H + +#include +#include +#include +#include + +struct rhash_head { + struct rhash_head __rcu *next; +}; + +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + +struct bucket_table; + +/** + * struct rhashtable_compare_arg - Key for the function rhashtable_compare + * @ht: Hash table + * @key: Key to compare against + */ +struct rhashtable_compare_arg { + struct rhashtable *ht; + const void *key; +}; + +typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); +typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); +typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, + const void *obj); + +/** + * struct rhashtable_params - Hash table construction parameters + * @nelem_hint: Hint on number of elements, should be 75% of desired size + * @key_len: Length of key + * @key_offset: Offset of key in struct to be hashed + * @head_offset: Offset of rhash_head in struct to be hashed + * @max_size: Maximum size while expanding + * @min_size: Minimum size while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker + * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) + * @obj_hashfn: Function to hash object + * @obj_cmpfn: Function to compare key with object + */ +struct rhashtable_params { + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; + unsigned int max_size; + u16 min_size; + bool automatic_shrinking; + u8 locks_mul; + u32 nulls_base; + rht_hashfn_t hashfn; + rht_obj_hashfn_t obj_hashfn; + rht_obj_cmpfn_t obj_cmpfn; +}; + +/** + * struct rhashtable - Hash table handle + * @tbl: Bucket table + * @key_len: Key length for hashfn + * @max_elems: Maximum number of elements in table + * @p: Configuration parameters + * @rhlist: True if this is an rhltable + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table + */ +struct rhashtable { + struct bucket_table __rcu *tbl; + unsigned int key_len; + unsigned int max_elems; + struct rhashtable_params p; + bool rhlist; + struct work_struct run_work; + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; +}; + +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @tbl: The table that we were walking over + */ +struct rhashtable_walker { + struct list_head list; + struct bucket_table *tbl; +}; + +/** + * struct rhashtable_iter - Hash table iterator + * @ht: Table to iterate through + * @p: Current pointer + * @list: Current hash list pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhlist_head *list; + struct rhashtable_walker walker; + unsigned int slot; + unsigned int skip; + bool end_of_table; +}; + +int rhashtable_init(struct rhashtable *ht, + const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); + +#endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 4e1f535c2034..48754ab07cdf 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * @@ -17,16 +18,14 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include -#include #include #include #include #include #include -#include #include +#include /* * The end of the chain is marked with a special nulls marks which has * the following format: @@ -64,15 +63,6 @@ */ #define RHT_ELASTICITY 16u -struct rhash_head { - struct rhash_head __rcu *next; -}; - -struct rhlist_head { - struct rhash_head rhead; - struct rhlist_head __rcu *next; -}; - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -102,114 +92,6 @@ struct bucket_table { struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -/** - * struct rhashtable_compare_arg - Key for the function rhashtable_compare - * @ht: Hash table - * @key: Key to compare against - */ -struct rhashtable_compare_arg { - struct rhashtable *ht; - const void *key; -}; - -typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed); -typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed); -typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, - const void *obj); - -struct rhashtable; - -/** - * struct rhashtable_params - Hash table construction parameters - * @nelem_hint: Hint on number of elements, should be 75% of desired size - * @key_len: Length of key - * @key_offset: Offset of key in struct to be hashed - * @head_offset: Offset of rhash_head in struct to be hashed - * @max_size: Maximum size while expanding - * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) - * @automatic_shrinking: Enable automatic shrinking of tables - * @nulls_base: Base value to generate nulls marker - * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) - * @obj_hashfn: Function to hash object - * @obj_cmpfn: Function to compare key with object - */ -struct rhashtable_params { - u16 nelem_hint; - u16 key_len; - u16 key_offset; - u16 head_offset; - unsigned int max_size; - u16 min_size; - bool automatic_shrinking; - u8 locks_mul; - u32 nulls_base; - rht_hashfn_t hashfn; - rht_obj_hashfn_t obj_hashfn; - rht_obj_cmpfn_t obj_cmpfn; -}; - -/** - * struct rhashtable - Hash table handle - * @tbl: Bucket table - * @key_len: Key length for hashfn - * @max_elems: Maximum number of elements in table - * @p: Configuration parameters - * @rhlist: True if this is an rhltable - * @run_work: Deferred worker to expand/shrink asynchronously - * @mutex: Mutex to protect current/future table swapping - * @lock: Spin lock to protect walker list - * @nelems: Number of elements in table - */ -struct rhashtable { - struct bucket_table __rcu *tbl; - unsigned int key_len; - unsigned int max_elems; - struct rhashtable_params p; - bool rhlist; - struct work_struct run_work; - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -}; - -/** - * struct rhltable - Hash table with duplicate objects in a list - * @ht: Underlying rhtable - */ -struct rhltable { - struct rhashtable ht; -}; - -/** - * struct rhashtable_walker - Hash table walker - * @list: List entry on list of walkers - * @tbl: The table that we were walking over - */ -struct rhashtable_walker { - struct list_head list; - struct bucket_table *tbl; -}; - -/** - * struct rhashtable_iter - Hash table iterator - * @ht: Table to iterate through - * @p: Current pointer - * @list: Current hash list pointer - * @walker: Associated rhashtable walker - * @slot: Current slot - * @skip: Number of entries to skip in slot - */ -struct rhashtable_iter { - struct rhashtable *ht; - struct rhash_head *p; - struct rhlist_head *list; - struct rhashtable_walker walker; - unsigned int slot; - unsigned int skip; - bool end_of_table; -}; - static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, } #endif /* CONFIG_PROVE_LOCKING */ -int rhashtable_init(struct rhashtable *ht, - const struct rhashtable_params *params); -int rhltable_init(struct rhltable *hlt, - const struct rhashtable_params *params); - void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ed07e3786d98..f4272a29dc44 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,7 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include +#include struct netns_frags { /* sysctls */ diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ba9fa4592f2b..0e355f4a3d76 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dbe1b911a24d..e0f962d27386 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -48,7 +48,7 @@ #define __sctp_structs_h__ #include -#include +#include #include /* linux/in.h needs this!! */ #include /* We get struct sockaddr_in. */ #include /* We get struct in6_addr */ diff --git a/include/net/seg6.h b/include/net/seg6.h index e029e301faa5..2567941a2f32 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 69c3a106056b..7fda469e2758 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 diff --git a/ipc/msg.c b/ipc/msg.c index 3b6545302598..203281198079 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/ipc/sem.c b/ipc/sem.c index 5af1943ad782..29c0347ef11d 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include "util.h" diff --git a/ipc/shm.c b/ipc/shm.c index 051a3e1fb8df..d4daf78df6da 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -43,6 +43,7 @@ #include #include #include +#include #include diff --git a/ipc/util.c b/ipc/util.c index 4e81182fa0ac..fdffff41f65b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -63,6 +63,7 @@ #include #include #include +#include #include diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 9427b5766134..c9fafea7dc6e 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -28,6 +28,7 @@ #include #include #include +#include #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4U diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..316518f87294 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9f79b9803a16..82f914122f1b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index cafb0506c8c9..1ad9aa62a97b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -2,6 +2,7 @@ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation */ +#include #include /* Sets everything common except 'dev', since that is done under locking */ diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0d0f0053bb11..d0b7e0249c13 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 0fdf2a55e746..8d0ba757a46c 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 33fb35cbfac1..b1791129a875 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 896d4a36081d..3f211e1025c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/input.c b/net/sctp/input.c index ba8a6e6c36fa..9bbc5f92c941 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -56,6 +56,7 @@ #include #include #include +#include /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d20f7addee19..0e91e83eea5a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From cadefe5f584abaac40dce72009e4de738cbff467 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Jun 2018 16:07:35 -0400 Subject: tcp_bbr: fix bbr pacing rate for internal pacing This commit makes BBR use only the MSS (without any headers) to calculate pacing rates when internal TCP-layer pacing is used. This is necessary to achieve the correct pacing behavior in this case, since tcp_internal_pacing() uses only the payload length to calculate pacing delays. Signed-off-by: Kevin Yang Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 11 +++++++++++ net/ipv4/tcp_bbr.c | 6 +++++- net/ipv4/tcp_output.c | 14 -------------- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0448e7c5d2b4..822ee49ed0f9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1184,6 +1184,17 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) return tp->is_cwnd_limited; } +/* BBR congestion control needs pacing. + * Same remark for SO_MAX_PACING_RATE. + * sch_fq packet scheduler is efficiently handling pacing, + * but is not always installed/used. + * Return true if TCP stack should pace packets itself. + */ +static inline bool tcp_needs_internal_pacing(const struct sock *sk) +{ + return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; +} + /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window. * We do not want to add fuel to the fire, or abort too early, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 58e2f479ffb4..3b5f45b9e81e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -205,7 +205,11 @@ static u32 bbr_bw(const struct sock *sk) */ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { - rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + unsigned int mss = tcp_sk(sk)->mss_cache; + + if (!tcp_needs_internal_pacing(sk)) + mss = tcp_mss_to_mtu(sk, mss); + rate *= mss; rate *= gain; rate >>= BBR_SCALE; rate *= USEC_PER_SEC; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..f8f6129160dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -973,17 +973,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -/* BBR congestion control needs pacing. - * Same remark for SO_MAX_PACING_RATE. - * sch_fq packet scheduler is efficiently handling pacing, - * but is not always installed/used. - * Return true if TCP stack should pace packets itself. - */ -static bool tcp_needs_internal_pacing(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; -} - static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) { u64 len_ns; @@ -995,9 +984,6 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) if (!rate || rate == ~0U) return; - /* Should account for header sizes as sch_fq does, - * but lets make things simple. - */ len_ns = (u64)skb->len * NSEC_PER_SEC; do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, -- cgit v1.2.3 From 5424ea27390f1f8903e5de0eaa0c5b561e8e877a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Jun 2018 16:27:47 -0700 Subject: netns: get more entropy from net_hash_mix() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct net are effectively allocated from order-1 pages on x86, with one object per slab, meaning that the 13 low order bits of their addresses are zero. Once shifted by L1_CACHE_SHIFT, this leaves 7 zero-bits, meaning that net_hash_mix() does not help spreading objects on various hash tables. For example, TCP listen table has 32 buckets, meaning that all netns use the same bucket for port 80 or port 443. Signed-off-by: Eric Dumazet Reported-by: Maciej Å»enczykowski Signed-off-by: David S. Miller --- include/net/netns/hash.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/hash.h b/include/net/netns/hash.h index 24c78183a4c2..16a842456189 100644 --- a/include/net/netns/hash.h +++ b/include/net/netns/hash.h @@ -9,12 +9,7 @@ struct net; static inline u32 net_hash_mix(const struct net *net) { #ifdef CONFIG_NET_NS - /* - * shift this right to eliminate bits, that are - * always zeroed - */ - - return (u32)(((unsigned long)net) >> L1_CACHE_SHIFT); + return (u32)(((unsigned long)net) >> ilog2(sizeof(*net))); #else return 0; #endif -- cgit v1.2.3 From 9b42c1f179a614e11893ae4619f0304a38f481ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 12:44:26 +0200 Subject: xfrm: Extend the output_mark to support input direction and masking. We already support setting an output mark at the xfrm_state, unfortunately this does not support the input direction and masking the marks that will be applied to the skb. This change adds support applying a masked value in both directions. The existing XFRMA_OUTPUT_MARK number is reused for this purpose and as it is now bi-directional, it is renamed to XFRMA_SET_MARK. An additional XFRMA_SET_MARK_MASK attribute is added for setting the mask. If the attribute mask not provided, it is set to 0xffffffff, keeping the XFRMA_OUTPUT_MARK existing 'full mask' semantics. Co-developed-by: Tobias Brunner Co-developed-by: Eyal Birger Co-developed-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Signed-off-by: Tobias Brunner Signed-off-by: Eyal Birger Signed-off-by: Lorenzo Colitti --- include/net/xfrm.h | 9 ++++++++- include/uapi/linux/xfrm.h | 4 +++- net/xfrm/xfrm_device.c | 3 ++- net/xfrm/xfrm_input.c | 2 ++ net/xfrm/xfrm_output.c | 3 +-- net/xfrm/xfrm_policy.c | 5 +++-- net/xfrm/xfrm_user.c | 48 +++++++++++++++++++++++++++++++++++++---------- 7 files changed, 57 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 557122846e0e..3dc83ba26f62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -166,7 +166,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; - u32 output_mark; + struct xfrm_mark smark; } props; struct xfrm_lifetime_cfg lft; @@ -2012,6 +2012,13 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m) return ret; } +static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) +{ + struct xfrm_mark *m = &x->props.smark; + + return (m->v & m->m) | (mark & ~m->m); +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index e3af2859188b..5a6ed7ce5a29 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -305,9 +305,11 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ - XFRMA_OUTPUT_MARK, /* __u32 */ + XFRMA_SET_MARK, /* __u32 */ + XFRMA_SET_MARK_MASK, /* __u32 */ __XFRMA_MAX +#define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ #define XFRMA_MAX (__XFRMA_MAX - 1) }; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 175941e15a6e..16c1230d20fa 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, - x->props.family, x->props.output_mark); + x->props.family, + xfrm_smark_get(0, x)); if (IS_ERR(dst)) return 0; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 352abca2605f..074810436242 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -339,6 +339,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + skb->mark = xfrm_smark_get(skb->mark, x); + skb->sp->xvec[skb->sp->len++] = x; lock: diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 89b178a78dc7..45ba07ab3e4f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } - if (x->props.output_mark) - skb->mark = x->props.output_mark; + skb->mark = xfrm_smark_get(skb->mark, x); err = x->outer_mode->output(x, skb); if (err) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5f48251c1319..7637637717ec 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1607,10 +1607,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { + __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); + family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family, - xfrm[i]->props.output_mark); + &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 080035f056d9..9602cc9e05ab 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, x->replay_maxdiff = nla_get_u32(rt); } +static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) +{ + if (attrs[XFRMA_SET_MARK]) { + m->v = nla_get_u32(attrs[XFRMA_SET_MARK]); + if (attrs[XFRMA_SET_MARK_MASK]) + m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]); + else + m->m = 0xffffffff; + } else { + m->v = m->m = 0; + } +} + static struct xfrm_state *xfrm_state_construct(struct net *net, struct xfrm_usersa_info *p, struct nlattr **attrs, @@ -579,8 +592,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - if (attrs[XFRMA_OUTPUT_MARK]) - x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + xfrm_smark_init(attrs, &x->props.smark); err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) @@ -824,6 +836,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return 0; } +static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) +{ + int ret = 0; + + if (m->v | m->m) { + ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v); + if (!ret) + ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m); + } + return ret; +} + /* Don't change this without updating xfrm_sa_len! */ static int copy_to_user_state_extra(struct xfrm_state *x, struct xfrm_usersa_info *p, @@ -887,6 +911,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = xfrm_mark_put(skb, &x->mark); if (ret) goto out; + + ret = xfrm_smark_put(skb, &x->props.smark); + if (ret) + goto out; + if (x->replay_esn) ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL, xfrm_replay_state_esn_len(x->replay_esn), @@ -900,11 +929,7 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - if (x->props.output_mark) { - ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); - if (ret) - goto out; - } + if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2493,7 +2518,8 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, - [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK] = { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2719,8 +2745,10 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(x->xso)); - if (x->props.output_mark) - l += nla_total_size(sizeof(x->props.output_mark)); + if (x->props.smark.v | x->props.smark.m) { + l += nla_total_size(sizeof(x->props.smark.v)); + l += nla_total_size(sizeof(x->props.smark.m)); + } /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.2.3 From d159ce7957eec306eacda672e5909e26675ca8ef Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:06:57 +0200 Subject: flow: Extend flow informations with xfrm interface id. Add a new flowi_xfrm structure with informations needed to do a xfrm lookup. At the moment it keeps the informations about the new xfrm interface id needed to lookup xfrm interfaces that are introduced with a followup patch. We need this new lookup key as other possible keys, like the ifindex is already part of the xfrm selector and used as a key to enforce the output device after the transformation in the policy/state lookup. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/flow.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 8ce21793094e..187c9bef672f 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,6 +26,10 @@ struct flowi_tunnel { __be64 tun_id; }; +struct flowi_xfrm { + __u32 if_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -39,6 +43,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -78,6 +83,7 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid +#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -109,6 +115,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -138,6 +145,7 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid +#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -185,6 +193,7 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid +#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) -- cgit v1.2.3 From 7e6526404adedf079279aa7aa11722deaca8fe2e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:07 +0200 Subject: xfrm: Add a new lookup key to match xfrm interfaces. This patch adds the xfrm interface id as a lookup key for xfrm states and policies. With this we can assign states and policies to virtual xfrm interfaces. Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Acked-by: Benedict Wong Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 21 +++++++++++++----- include/uapi/linux/xfrm.h | 1 + net/core/pktgen.c | 2 +- net/key/af_key.c | 6 +++--- net/xfrm/xfrm_policy.c | 18 +++++++++++----- net/xfrm/xfrm_state.c | 19 ++++++++++++----- net/xfrm/xfrm_user.c | 54 +++++++++++++++++++++++++++++++++++++++++------ 7 files changed, 96 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3dc83ba26f62..e8bada4d2a45 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,6 +147,7 @@ struct xfrm_state { struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; + u32 if_id; u32 tfcpad; u32 genid; @@ -574,6 +575,7 @@ struct xfrm_policy { atomic_t genid; u32 priority; u32 index; + u32 if_id; struct xfrm_mark mark; struct xfrm_selector selector; struct xfrm_lifetime_cfg lft; @@ -1533,7 +1535,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); -struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, +struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, @@ -1690,20 +1692,20 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, void *); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, - u32 id, int delete, int *err); +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, u8, + int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u8 proto, + u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); @@ -2019,6 +2021,15 @@ static inline __u32 xfrm_smark_get(__u32 mark, struct xfrm_state *x) return (m->v & m->m) | (mark & ~m->m); } +static inline int xfrm_if_id_put(struct sk_buff *skb, __u32 if_id) +{ + int ret = 0; + + if (if_id) + ret = nla_put_u32(skb, XFRMA_IF_ID, if_id); + return ret; +} + static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, unsigned int family) { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 5a6ed7ce5a29..5f3b9fec7b5f 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -307,6 +307,7 @@ enum xfrm_attr_type_t { XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ XFRMA_SET_MARK, /* __u32 */ XFRMA_SET_MARK_MASK, /* __u32 */ + XFRMA_IF_ID, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 49368e21d228..6d37dbf0aa64 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2255,7 +2255,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { /* slow path: we dont already have xfrm_state */ - x = xfrm_stateonly_find(pn->net, DUMMY_MARK, + x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, AF_INET, diff --git a/net/key/af_key.c b/net/key/af_key.c index 8bdc1cbe490a..398ebcd614a0 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa return err; } - xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); @@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); - xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN, + xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7637637717ec..fc0c69312b2c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -747,6 +747,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && + pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(policy, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && @@ -798,8 +799,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, - int dir, struct xfrm_selector *sel, +struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, + struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { @@ -812,6 +814,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) { @@ -837,8 +840,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, - int dir, u32 id, int delete, int *err) +struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id, + u8 type, int dir, u32 id, int delete, + int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; @@ -853,6 +857,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && + pol->if_id == if_id && (mark & pol->mark.m) == pol->mark.v) { xfrm_pol_hold(pol); if (delete) { @@ -1063,6 +1068,7 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, bool match; if (pol->family != family || + pol->if_id != fl->flowi_xfrm.if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ -1177,7 +1183,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v) { + if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + pol->if_id != fl->flowi_xfrm.if_id) { pol = NULL; goto out; } @@ -1305,6 +1312,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; + newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 8308281f3253..3803b6813fc5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -941,6 +941,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; + u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; @@ -955,6 +956,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && @@ -971,6 +973,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && @@ -1010,6 +1013,7 @@ found: * to current session. */ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); + x->if_id = if_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1067,7 +1071,7 @@ out: } struct xfrm_state * -xfrm_stateonly_find(struct net *net, u32 mark, +xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { @@ -1080,6 +1084,7 @@ xfrm_stateonly_find(struct net *net, u32 mark, if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && @@ -1160,11 +1165,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; + u32 if_id = xnew->if_id; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && + x->if_id == if_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1187,7 +1194,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u8 proto, + u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1242,6 +1249,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; + x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; @@ -1296,7 +1304,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->id.proto, + x->props.reqid, x->if_id, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1395,6 +1403,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; @@ -1619,13 +1628,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u8 proto, const xfrm_address_t *daddr, + u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 9602cc9e05ab..79245e1c3487 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -594,6 +594,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_smark_init(attrs, &x->props.smark); + if (attrs[XFRMA_IF_ID]) + x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; @@ -929,7 +932,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, ret = copy_user_offload(&x->xso, skb); if (ret) goto out; - + if (x->if_id) { + ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id); + if (ret) + goto out; + } if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -1278,6 +1285,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u32 mark; struct xfrm_mark m; + u32 if_id = 0; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max); @@ -1290,6 +1298,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, x = NULL; mark = xfrm_mark_get(attrs, &m); + + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->info.seq) { x = xfrm_find_acq_byseq(net, mark, p->info.seq); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { @@ -1300,7 +1312,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - p->info.id.proto, daddr, + if_id, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -1588,6 +1600,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us xfrm_mark_get(attrs, &xp->mark); + if (attrs[XFRMA_IF_ID]) + xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + return xp; error: *errp = err; @@ -1733,6 +1748,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1814,6 +1831,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, int delete; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; p = nlmsg_data(nlh); delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; @@ -1826,8 +1844,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -1844,7 +1865,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, delete, &err); security_xfrm_policy_free(ctx); } @@ -1967,6 +1988,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) goto out_cancel; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + goto out_cancel; + nlmsg_end(skb, nlh); return 0; @@ -2109,6 +2134,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, int err = -ENOENT; struct xfrm_mark m; u32 mark = xfrm_mark_get(attrs, &m); + u32 if_id = 0; err = copy_from_user_policy_type(&type, attrs); if (err) @@ -2118,8 +2144,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (p->index) - xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err); + xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err); else { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; @@ -2136,7 +2165,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; } - xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, + xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel, ctx, 0, &err); security_xfrm_policy_free(ctx); } @@ -2520,6 +2549,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, [XFRMA_SET_MARK] = { .type = NLA_U32 }, [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2651,6 +2681,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct if (err) return err; + err = xfrm_if_id_put(skb, x->if_id); + if (err) + return err; + nlmsg_end(skb, nlh); return 0; } @@ -2749,6 +2783,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.smark.v)); l += nla_total_size(sizeof(x->props.smark.m)); } + if (x->if_id) + l += nla_total_size(sizeof(x->if_id)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -2878,6 +2914,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -2994,6 +3032,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3075,6 +3115,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = copy_to_user_policy_type(xp->type, skb); if (!err) err = xfrm_mark_put(skb, &xp->mark); + if (!err) + err = xfrm_if_id_put(skb, xp->if_id); if (err) goto out_free_skb; -- cgit v1.2.3 From f203b76d78092faf248db3f851840fbecf80b40e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 12 Jun 2018 14:07:12 +0200 Subject: xfrm: Add virtual xfrm interfaces This patch adds support for virtual xfrm interfaces. Packets that are routed through such an interface are guaranteed to be IPsec transformed or dropped. It is a generic virtual interface that ensures IPsec transformation, no need to know what happens behind the interface. This means that we can tunnel IPv4 and IPv6 through the same interface and support all xfrm modes (tunnel, transport and beet) on it. Co-developed-by: Lorenzo Colitti Co-developed-by: Benedict Wong Signed-off-by: Lorenzo Colitti Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert Acked-by: Shannon Nelson Tested-by: Benedict Wong Tested-by: Antony Antony Reviewed-by: Eyal Birger --- include/net/xfrm.h | 24 ++ include/uapi/linux/if_link.h | 10 + net/xfrm/Kconfig | 8 + net/xfrm/Makefile | 1 + net/xfrm/xfrm_input.c | 3 + net/xfrm/xfrm_interface.c | 972 +++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_policy.c | 43 ++ 7 files changed, 1061 insertions(+) create mode 100644 net/xfrm/xfrm_interface.c (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e8bada4d2a45..3fa578a6a819 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -293,6 +294,13 @@ struct xfrm_replay { int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; +struct xfrm_if_cb { + struct xfrm_if *(*decode_session)(struct sk_buff *skb); +}; + +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); +void xfrm_if_unregister_cb(void); + struct net_device; struct xfrm_type; struct xfrm_dst; @@ -1039,6 +1047,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); +struct xfrm_if_parms { + char name[IFNAMSIZ]; /* name of XFRM device */ + int link; /* ifindex of underlying L2 interface */ + u32 if_id; /* interface identifyer */ +}; + +struct xfrm_if { + struct xfrm_if __rcu *next; /* next interface in list */ + struct net_device *dev; /* virtual device associated with interface */ + struct net_device *phydev; /* physical device */ + struct net *net; /* netns for packet i/o */ + struct xfrm_if_parms p; /* interface parms */ + + struct gro_cells gro_cells; +}; + struct xfrm_offload { /* Output sequence number for replay protection on offloading. */ struct { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cf01b6824244..bff0af507b32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -459,6 +459,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 286ed25c1a69..53381888a7b3 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -25,6 +25,14 @@ config XFRM_USER If unsure, say Y. +config XFRM_INTERFACE + tristate "Transformation virtual interface" + depends on XFRM && IPV6 + ---help--- + This provides a virtual interface to route IPsec traffic. + + If unsure, say N. + config XFRM_SUB_POLICY bool "Transformation sub policy support" depends on XFRM diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 0bd2465a8c5a..fbc4552d17b8 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o +obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 074810436242..b89c9c7f8c5c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) seq = 0; if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } @@ -328,12 +329,14 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_SPI_SKB_CB(skb)->daddroff); do { if (skb->sp->len == XFRM_MAX_DEPTH) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { + secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c new file mode 100644 index 000000000000..31cb1c7e3881 --- /dev/null +++ b/net/xfrm/xfrm_interface.c @@ -0,0 +1,972 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM virtual interface + * + * Copyright (C) 2018 secunet Security Networks AG + * + * Author: + * Steffen Klassert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrmi_dev_init(struct net_device *dev); +static void xfrmi_dev_setup(struct net_device *dev); +static struct rtnl_link_ops xfrmi_link_ops __read_mostly; +static unsigned int xfrmi_net_id __read_mostly; + +struct xfrmi_net { + /* lists for storing interfaces in use */ + struct xfrm_if __rcu *xfrmi[1]; +}; + +#define for_each_xfrmi_rcu(start, xi) \ + for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) + +static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if *xi; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (x->if_id == xi->p.if_id && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) +{ + struct xfrmi_net *xfrmn; + int ifindex; + struct xfrm_if *xi; + + if (!skb->dev) + return NULL; + + xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id); + ifindex = skb->dev->ifindex; + + for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { + if (ifindex == xi->dev->ifindex && + (xi->dev->flags & IFF_UP)) + return xi; + } + + return NULL; +} + +static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0]; + + rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); + rcu_assign_pointer(*xip, xi); +} + +static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *iter; + + for (xip = &xfrmn->xfrmi[0]; + (iter = rtnl_dereference(*xip)) != NULL; + xip = &iter->next) { + if (xi == iter) { + rcu_assign_pointer(*xip, xi->next); + break; + } + } +} + +static void xfrmi_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); +} + +static int xfrmi_create2(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + dev->rtnl_link_ops = &xfrmi_link_ops; + err = register_netdevice(dev); + if (err < 0) + goto out; + + strcpy(xi->p.name, dev->name); + + dev_hold(dev); + xfrmi_link(xfrmn, xi); + + return 0; + +out: + return err; +} + +static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p) +{ + struct net_device *dev; + struct xfrm_if *xi; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + goto failed; + + dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup); + if (!dev) + goto failed; + + dev_net_set(dev, net); + + xi = netdev_priv(dev); + xi->p = *p; + xi->net = net; + xi->dev = dev; + xi->phydev = dev_get_by_index(net, p->link); + if (!xi->phydev) + goto failed_free; + + err = xfrmi_create2(dev); + if (err < 0) + goto failed_dev_put; + + return xi; + +failed_dev_put: + dev_put(xi->phydev); +failed_free: + free_netdev(dev); +failed: + return NULL; +} + +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p, + int create) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + for (xip = &xfrmn->xfrmi[0]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) { + if (xi->p.if_id == p->if_id) { + if (create) + return NULL; + + return xi; + } + } + if (!create) + return NULL; + return xfrmi_create(net, p); +} + +static void xfrmi_dev_uninit(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); + + xfrmi_unlink(xfrmn, xi); + dev_put(xi->phydev); + dev_put(dev); +} + +static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb->tstamp = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + nf_reset(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + ipvs_reset(skb); + secpath_reset(skb); + skb_orphan(skb); + skb->mark = 0; +} + +static int xfrmi_rcv_cb(struct sk_buff *skb, int err) +{ + struct pcpu_sw_netstats *tstats; + struct xfrm_mode *inner_mode; + struct net_device *dev; + struct xfrm_state *x; + struct xfrm_if *xi; + bool xnet; + + if (err && !skb->sp) + return 0; + + x = xfrm_input_state(skb); + + xi = xfrmi_lookup(xs_net(x), x); + if (!xi) + return 1; + + dev = xi->dev; + skb->dev = dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + xnet = !net_eq(xi->net, dev_net(skb->dev)); + + if (xnet) { + inner_mode = x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, + inner_mode->afinfo->family)) + return -EPERM; + } + + xfrmi_scrub_packet(skb, xnet); + + tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); + + return 0; +} + +static int +xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; + struct xfrm_state *x; + int err = -1; + int mtu; + + if (!dst) + goto tx_err_link_failure; + + fl->flowi_xfrm.if_id = xi->p.if_id; + + dst_hold(dst); + dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + x = dst->xfrm; + if (!x) + goto tx_err_link_failure; + + if (x->if_id != xi->p.if_id) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + xi->p.name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst); + if (!skb->ignore_df && skb->len > mtu) { + skb_dst_update_pmtu(skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } else { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } + + dst_release(dst); + return -EMSGSIZE; + } + + xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = tdev; + + err = dst_output(xi->net, skb->sk, skb); + if (net_xmit_eval(err) == 0) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += length; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + break; + default: + goto tx_err; + } + + fl.flowi_oif = xi->phydev->ifindex; + + ret = xfrmi_xmit2(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int xfrmi4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->protocol; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0); + else + ipv4_redirect(skb, net, 0, 0, protocol, 0); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->nexthdr; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + else + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) +{ + if (xi->p.link != p->link) + return -EINVAL; + + xi->p.if_id = p->if_id; + + return 0; +} + +static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) +{ + struct net *net = dev_net(xi->dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + xfrmi_unlink(xfrmn, xi); + synchronize_net(); + err = xfrmi_change(xi, p); + xfrmi_link(xfrmn, xi); + netdev_state_change(xi->dev); + return err; +} + +static void xfrmi_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + int cpu; + + if (!dev->tstats) + return; + + for_each_possible_cpu(cpu) { + struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + int start; + + stats = per_cpu_ptr(dev->tstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } + + s->rx_dropped = dev->stats.rx_dropped; + s->tx_dropped = dev->stats.tx_dropped; +} + +static int xfrmi_get_iflink(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->phydev->ifindex; +} + + +static const struct net_device_ops xfrmi_netdev_ops = { + .ndo_init = xfrmi_dev_init, + .ndo_uninit = xfrmi_dev_uninit, + .ndo_start_xmit = xfrmi_xmit, + .ndo_get_stats64 = xfrmi_get_stats64, + .ndo_get_iflink = xfrmi_get_iflink, +}; + +static void xfrmi_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &xfrmi_netdev_ops; + dev->type = ARPHRD_NONE; + dev->hard_header_len = ETH_HLEN; + dev->min_header_len = ETH_HLEN; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = ETH_DATA_LEN; + dev->addr_len = ETH_ALEN; + dev->flags = IFF_NOARP; + dev->needs_free_netdev = true; + dev->priv_destructor = xfrmi_dev_free; + netif_keep_dst(dev); +} + +static int xfrmi_dev_init(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device *phydev = xi->phydev; + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&xi->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + dev->features |= NETIF_F_LLTX; + + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; + + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); + + return 0; +} + +static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void xfrmi_netlink_parms(struct nlattr *data[], + struct xfrm_if_parms *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_XFRM_LINK]) + parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); + + if (data[IFLA_XFRM_IF_ID]) + parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); +} + +static int xfrmi_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net *net = dev_net(dev); + struct xfrm_if_parms *p; + struct xfrm_if *xi; + + xi = netdev_priv(dev); + p = &xi->p; + + xfrmi_netlink_parms(data, p); + + if (!tb[IFLA_IFNAME]) + return -EINVAL; + + nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ); + + if (!xfrmi_locate(net, p, 1)) + return -EEXIST; + + return 0; +} + +static void xfrmi_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + +static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + + xfrmi_netlink_parms(data, &xi->p); + + xi = xfrmi_locate(net, &xi->p, 0); + + if (xi) { + if (xi->dev != dev) + return -EEXIST; + } else + xi = netdev_priv(dev); + + return xfrmi_update(xi, &xi->p); +} + +static size_t xfrmi_get_size(const struct net_device *dev) +{ + return + /* IFLA_XFRM_LINK */ + nla_total_size(4) + + /* IFLA_XFRM_IF_ID */ + nla_total_size(4) + + 0; +} + +static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrm_if_parms *parm = &xi->p; + + if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +struct net *xfrmi_get_link_net(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return dev_net(xi->phydev); +} + +static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, +}; + +static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { + .kind = "xfrm", + .maxtype = IFLA_XFRM_MAX, + .policy = xfrmi_policy, + .priv_size = sizeof(struct xfrm_if), + .setup = xfrmi_dev_setup, + .validate = xfrmi_validate, + .newlink = xfrmi_newlink, + .dellink = xfrmi_dellink, + .changelink = xfrmi_changelink, + .get_size = xfrmi_get_size, + .fill_info = xfrmi_fill_info, + .get_link_net = xfrmi_get_link_net, +}; + +static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn) +{ + struct xfrm_if *xi; + LIST_HEAD(list); + + xi = rtnl_dereference(xfrmn->xfrmi[0]); + if (!xi) + return; + + unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init xfrmi_init_net(struct net *net) +{ + return 0; +} + +static void __net_exit xfrmi_exit_net(struct net *net) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + rtnl_lock(); + xfrmi_destroy_interfaces(xfrmn); + rtnl_unlock(); +} + +static struct pernet_operations xfrmi_net_ops = { + .init = xfrmi_init_net, + .exit = xfrmi_exit_net, + .id = &xfrmi_net_id, + .size = sizeof(struct xfrmi_net), +}; + +static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static int __init xfrmi4_init(void) +{ + int err; + + err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi4_fini(void) +{ + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +} + +static int __init xfrmi6_init(void) +{ + int err; + + err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; + + return 0; + +xfrm_proto_comp_failed: + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi6_fini(void) +{ + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +} + +static const struct xfrm_if_cb xfrm_if_cb = { + .decode_session = xfrmi_decode_session, +}; + +static int __init xfrmi_init(void) +{ + const char *msg; + int err; + + pr_info("IPsec XFRM device driver\n"); + + msg = "tunnel device"; + err = register_pernet_device(&xfrmi_net_ops); + if (err < 0) + goto pernet_dev_failed; + + msg = "xfrm4 protocols"; + err = xfrmi4_init(); + if (err < 0) + goto xfrmi4_failed; + + msg = "xfrm6 protocols"; + err = xfrmi6_init(); + if (err < 0) + goto xfrmi6_failed; + + + msg = "netlink interface"; + err = rtnl_link_register(&xfrmi_link_ops); + if (err < 0) + goto rtnl_link_failed; + + xfrm_if_register_cb(&xfrm_if_cb); + + return err; + +rtnl_link_failed: + xfrmi6_fini(); +xfrmi6_failed: + xfrmi4_fini(); +xfrmi4_failed: + unregister_pernet_device(&xfrmi_net_ops); +pernet_dev_failed: + pr_err("xfrmi init: failed to register %s\n", msg); + return err; +} + +static void __exit xfrmi_fini(void) +{ + xfrm_if_unregister_cb(); + rtnl_link_unregister(&xfrmi_link_ops); + xfrmi4_fini(); + xfrmi6_fini(); + unregister_pernet_device(&xfrmi_net_ops); +} + +module_init(xfrmi_init); +module_exit(xfrmi_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("xfrm"); +MODULE_ALIAS_NETDEV("xfrm0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("XFRM virtual interface"); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index fc0c69312b2c..d960ea6657b5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -47,6 +47,9 @@ struct xfrm_flo { static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); static struct work_struct *xfrm_pcpu_work __read_mostly; +static DEFINE_SPINLOCK(xfrm_if_cb_lock); +static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; + static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; @@ -119,6 +122,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } +/* Called with rcu_read_lock(). */ +static const struct xfrm_if_cb *xfrm_if_get_cb(void) +{ + return rcu_dereference(xfrm_if_cb); +} + struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, @@ -2083,6 +2092,11 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (IS_ERR(xdst)) { err = PTR_ERR(xdst); + if (err == -EREMOTE) { + xfrm_pols_put(pols, num_pols); + return NULL; + } + if (err != -EAGAIN) goto error; goto make_dummy_bundle; @@ -2176,6 +2190,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); + if (err == -EREMOTE) + goto nopol; + goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; @@ -2368,12 +2385,20 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); + struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + fl->flowi_xfrm.if_id = xi->p.if_id; + } + err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); return err; @@ -2828,6 +2853,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); +void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) +{ + spin_lock(&xfrm_if_cb_lock); + rcu_assign_pointer(xfrm_if_cb, ifcb); + spin_unlock(&xfrm_if_cb_lock); +} +EXPORT_SYMBOL(xfrm_if_register_cb); + +void xfrm_if_unregister_cb(void) +{ + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_if_unregister_cb); + #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { @@ -3008,6 +3048,9 @@ void __init xfrm_init(void) xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); + + RCU_INIT_POINTER(xfrm_if_cb, NULL); + synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL -- cgit v1.2.3 From e4db5b61c572475bbbcf63e3c8a2606bfccf2c9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:26:02 +0200 Subject: xfrm: policy: remove pcpu policy cache Kristian Evensen says: In a project I am involved in, we are running ipsec (Strongswan) on different mt7621-based routers. Each router is configured as an initiator and has around ~30 tunnels to different responders (running on misc. devices). Before the flow cache was removed (kernel 4.9), we got a combined throughput of around 70Mbit/s for all tunnels on one router. However, we recently switched to kernel 4.14 (4.14.48), and the total throughput is somewhere around 57Mbit/s (best-case). I.e., a drop of around 20%. Reverting the flow cache removal restores, as expected, performance levels to that of kernel 4.9. When pcpu xdst exists, it has to be validated first before it can be used. A negative hit thus increases cost vs. no-cache. As number of tunnels increases, hit rate decreases so this pcpu caching isn't a viable strategy. Furthermore, the xdst cache also needs to run with BH off, so when removing this the bh disable/enable pairs can be removed too. Kristian tested a 4.14.y backport of this change and reported increased performance: In our tests, the throughput reduction has been reduced from around -20% to -5%. We also see that the overall throughput is independent of the number of tunnels, while before the throughput was reduced as the number of tunnels increased. Reported-by: Kristian Evensen Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/xfrm/xfrm_device.c | 10 ---- net/xfrm/xfrm_policy.c | 139 +------------------------------------------------ net/xfrm/xfrm_state.c | 5 +- 4 files changed, 3 insertions(+), 152 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3fa578a6a819..a5378613a49c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -332,7 +332,6 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); -void xfrm_policy_cache_flush(void); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 16c1230d20fa..11d56a44e9e8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -307,12 +307,6 @@ static int xfrm_dev_register(struct net_device *dev) return xfrm_api_check(dev); } -static int xfrm_dev_unregister(struct net_device *dev) -{ - xfrm_policy_cache_flush(); - return NOTIFY_DONE; -} - static int xfrm_dev_feat_change(struct net_device *dev) { return xfrm_api_check(dev); @@ -323,7 +317,6 @@ static int xfrm_dev_down(struct net_device *dev) if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); - xfrm_policy_cache_flush(); return NOTIFY_DONE; } @@ -335,9 +328,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void case NETDEV_REGISTER: return xfrm_dev_register(dev); - case NETDEV_UNREGISTER: - return xfrm_dev_unregister(dev); - case NETDEV_FEAT_CHANGE: return xfrm_dev_feat_change(dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d960ea6657b5..ef75891450e7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -45,8 +45,6 @@ struct xfrm_flo { u8 flags; }; -static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); -static struct work_struct *xfrm_pcpu_work __read_mostly; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; @@ -1732,108 +1730,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, } -static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old) -{ - this_cpu_write(xfrm_last_dst, xdst); - if (old) - dst_release(&old->u.dst); -} - -static void __xfrm_pcpu_work_fn(void) -{ - struct xfrm_dst *old; - - old = this_cpu_read(xfrm_last_dst); - if (old && !xfrm_bundle_ok(old)) - xfrm_last_dst_update(NULL, old); -} - -static void xfrm_pcpu_work_fn(struct work_struct *work) -{ - local_bh_disable(); - rcu_read_lock(); - __xfrm_pcpu_work_fn(); - rcu_read_unlock(); - local_bh_enable(); -} - -void xfrm_policy_cache_flush(void) -{ - struct xfrm_dst *old; - bool found = false; - int cpu; - - might_sleep(); - - local_bh_disable(); - rcu_read_lock(); - for_each_possible_cpu(cpu) { - old = per_cpu(xfrm_last_dst, cpu); - if (old && !xfrm_bundle_ok(old)) { - if (smp_processor_id() == cpu) { - __xfrm_pcpu_work_fn(); - continue; - } - found = true; - break; - } - } - - rcu_read_unlock(); - local_bh_enable(); - - if (!found) - return; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - bool bundle_release; - - rcu_read_lock(); - old = per_cpu(xfrm_last_dst, cpu); - bundle_release = old && !xfrm_bundle_ok(old); - rcu_read_unlock(); - - if (!bundle_release) - continue; - - if (cpu_online(cpu)) { - schedule_work_on(cpu, &xfrm_pcpu_work[cpu]); - continue; - } - - rcu_read_lock(); - old = per_cpu(xfrm_last_dst, cpu); - if (old && !xfrm_bundle_ok(old)) { - per_cpu(xfrm_last_dst, cpu) = NULL; - dst_release(&old->u.dst); - } - rcu_read_unlock(); - } - - put_online_cpus(); -} - -static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst, - struct xfrm_state * const xfrm[], - int num) -{ - const struct dst_entry *dst = &xdst->u.dst; - int i; - - if (xdst->num_xfrms != num) - return false; - - for (i = 0; i < num; i++) { - if (!dst || dst->xfrm != xfrm[i]) - return false; - dst = xfrm_dst_child(dst); - } - - return xfrm_bundle_ok(xdst); -} - static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, @@ -1842,7 +1738,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; - struct xfrm_dst *xdst, *old; + struct xfrm_dst *xdst; struct dst_entry *dst; int err; @@ -1854,22 +1750,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, return ERR_PTR(err); } - xdst = this_cpu_read(xfrm_last_dst); - if (xdst && - xdst->u.dst.dev == dst_orig->dev && - xdst->num_pols == num_pols && - memcmp(xdst->pols, pols, - sizeof(struct xfrm_policy *) * num_pols) == 0 && - xfrm_xdst_can_reuse(xdst, xfrm, err)) { - dst_hold(&xdst->u.dst); - xfrm_pols_put(pols, num_pols); - while (err > 0) - xfrm_state_put(xfrm[--err]); - return xdst; - } - - old = xdst; - dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); @@ -1882,9 +1762,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); - atomic_set(&xdst->u.dst.__refcnt, 2); - xfrm_last_dst_update(xdst, old); - return xdst; } @@ -2085,11 +1962,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, if (num_xfrms <= 0) goto make_dummy_bundle; - local_bh_disable(); xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); - local_bh_enable(); - if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { @@ -2181,11 +2055,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, goto no_transform; } - local_bh_disable(); xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); - local_bh_enable(); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); @@ -3035,15 +2907,6 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { - int i; - - xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), - GFP_KERNEL); - BUG_ON(!xfrm_pcpu_work); - - for (i = 0; i < NR_CPUS; i++) - INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); - register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3803b6813fc5..e04a510ec992 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -735,10 +735,9 @@ restart: } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); - if (cnt) { + if (cnt) err = 0; - xfrm_policy_cache_flush(); - } + return err; } EXPORT_SYMBOL(xfrm_state_flush); -- cgit v1.2.3 From d4546c2509b1e9cd082e3682dcec98472e37ee5a Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 24 Jun 2018 14:13:49 +0900 Subject: net: Convert GRO SKB handling to list_head. Manage pending per-NAPI GRO packets via list_head. Return an SKB pointer from the GRO receive handlers. When GRO receive handlers return non-NULL, it means that this SKB needs to be completed at this time and removed from the NAPI queue. Several operations are greatly simplified by this transformation, especially timing out the oldest SKB in the list when gro_count exceeds MAX_GRO_SKBS, and napi_gro_flush() which walks the queue in reverse order. Signed-off-by: David S. Miller --- drivers/net/geneve.c | 11 ++++---- drivers/net/vxlan.c | 11 ++++---- include/linux/etherdevice.h | 3 +- include/linux/netdevice.h | 32 ++++++++++----------- include/linux/skbuff.h | 3 +- include/linux/udp.h | 4 +-- include/net/inet_common.h | 2 +- include/net/tcp.h | 2 +- include/net/udp.h | 4 +-- include/net/udp_tunnel.h | 6 ++-- net/8021q/vlan.c | 13 +++++---- net/core/dev.c | 68 +++++++++++++++++++-------------------------- net/core/skbuff.c | 4 +-- net/ethernet/eth.c | 12 ++++---- net/ipv4/af_inet.c | 12 ++++---- net/ipv4/esp4_offload.c | 4 +-- net/ipv4/fou.c | 20 ++++++------- net/ipv4/gre_offload.c | 8 +++--- net/ipv4/tcp_offload.c | 14 +++++----- net/ipv4/udp_offload.c | 13 +++++---- net/ipv6/esp6_offload.c | 4 +-- net/ipv6/ip6_offload.c | 16 +++++------ net/ipv6/tcpv6_offload.c | 4 +-- net/ipv6/udp_offload.c | 4 +-- 24 files changed, 133 insertions(+), 141 deletions(-) (limited to 'include/net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..3e94375b9b01 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -418,11 +418,12 @@ static int geneve_hlen(struct genevehdr *gh) return sizeof(*gh) + gh->opt_len * 4; } -static struct sk_buff **geneve_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *geneve_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct genevehdr *gh, *gh2; unsigned int hlen, gh_len, off_gnv; const struct packet_offload *ptype; @@ -449,7 +450,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, goto out; } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..cc14e0cd5647 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -568,11 +568,12 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return vh; } -static struct sk_buff **vxlan_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vxlan_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; int flush = 1; @@ -607,7 +608,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 79563840c295..572e11bb8696 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -59,8 +59,7 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb); +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..f176d9873910 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -322,7 +322,7 @@ struct napi_struct { int poll_owner; #endif struct net_device *dev; - struct sk_buff *gro_list; + struct list_head gro_list; struct sk_buff *skb; struct hrtimer timer; struct list_head dev_list; @@ -2255,10 +2255,10 @@ static inline int gro_recursion_inc_test(struct sk_buff *skb) return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; } -typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *); -static inline struct sk_buff **call_gro_receive(gro_receive_t cb, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); +static inline struct sk_buff *call_gro_receive(gro_receive_t cb, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2268,12 +2268,12 @@ static inline struct sk_buff **call_gro_receive(gro_receive_t cb, return cb(head, skb); } -typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **, - struct sk_buff *); -static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, + struct sk_buff *); +static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, + struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { if (unlikely(gro_recursion_inc_test(skb))) { NAPI_GRO_CB(skb)->flush |= 1; @@ -2299,8 +2299,8 @@ struct packet_type { struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); - struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *(*gro_receive)(struct list_head *head, + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -2568,7 +2568,7 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb); +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline unsigned int skb_gro_offset(const struct sk_buff *skb) { @@ -2784,13 +2784,13 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, } #ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } #else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) +static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c86885954994..7ccc601b55d9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -677,7 +677,8 @@ struct sk_buff { int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem & tcp stack */ + struct list_head list; }; struct sock *sk; diff --git a/include/linux/udp.h b/include/linux/udp.h index ca840345571b..320d49d85484 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -74,8 +74,8 @@ struct udp_sock { void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ - struct sk_buff ** (*gro_receive)(struct sock *sk, - struct sk_buff **head, + struct sk_buff * (*gro_receive)(struct sock *sk, + struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 384b90c62c0b..3ca969cbd161 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -43,7 +43,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/include/net/tcp.h b/include/net/tcp.h index 822ee49ed0f9..402a88b0e8a8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1788,7 +1788,7 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb); +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); diff --git a/include/net/udp.h b/include/net/udp.h index b1ea8b0f5e6a..5723c6128ae4 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -170,8 +170,8 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup); +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index b95a6927c718..fe680ab6b15a 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -65,9 +65,9 @@ static inline int udp_sock_create(struct net *net, typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); -typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb); +typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, int nhoff); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..99141986efa0 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -647,13 +647,14 @@ out: return err; } -static struct sk_buff **vlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *vlan_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct vlan_hdr *vhdr; - unsigned int hlen, off_vlan; const struct packet_offload *ptype; + unsigned int hlen, off_vlan; + struct sk_buff *pp = NULL; + struct vlan_hdr *vhdr; + struct sk_buff *p; __be16 type; int flush = 1; @@ -675,7 +676,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/core/dev.c b/net/core/dev.c index a5aa1c7444e6..aa61b9344b46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4881,36 +4881,25 @@ out: */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *prev = NULL; - - /* scan list and build reverse chain */ - for (skb = napi->gro_list; skb != NULL; skb = skb->next) { - skb->prev = prev; - prev = skb; - } - - for (skb = prev; skb; skb = prev) { - skb->next = NULL; + struct sk_buff *skb, *p; + list_for_each_entry_safe_reverse(skb, p, &napi->gro_list, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; - - prev = skb->prev; + list_del_init(&skb->list); napi_gro_complete(skb); napi->gro_count--; } - - napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff *p; unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; - for (p = napi->gro_list; p; p = p->next) { + list_for_each_entry(p, &napi->gro_list, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; @@ -4977,12 +4966,12 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int same_flow; + struct sk_buff *pp = NULL; enum gro_result ret; + int same_flow; int grow; if (netif_elide_gro(skb->dev)) @@ -5039,11 +5028,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { - struct sk_buff *nskb = *pp; - - *pp = nskb->next; - nskb->next = NULL; - napi_gro_complete(nskb); + list_del_init(&pp->list); + napi_gro_complete(pp); napi->gro_count--; } @@ -5054,15 +5040,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff goto normal; if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { - struct sk_buff *nskb = napi->gro_list; + struct sk_buff *nskb; - /* locate the end of the list to select the 'oldest' flow */ - while (nskb->next) { - pp = &nskb->next; - nskb = *pp; - } - *pp = NULL; - nskb->next = NULL; + nskb = list_last_entry(&napi->gro_list, struct sk_buff, list); + list_del(&nskb->list); napi_gro_complete(nskb); } else { napi->gro_count++; @@ -5071,8 +5052,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - skb->next = napi->gro_list; - napi->gro_list = skb; + list_add(&skb->list, &napi->gro_list); ret = GRO_HELD; pull: @@ -5478,7 +5458,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { unsigned long timeout = 0; if (work_done) @@ -5687,7 +5667,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_list && !napi_disable_pending(napi) && + if (!list_empty(&napi->gro_list) && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -5701,7 +5681,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi->gro_count = 0; - napi->gro_list = NULL; + INIT_LIST_HEAD(&napi->gro_list); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) @@ -5734,6 +5714,14 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +static void gro_list_free(struct list_head *head) +{ + struct sk_buff *skb, *p; + + list_for_each_entry_safe(skb, p, head, list) + kfree_skb(skb); +} + /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { @@ -5743,8 +5731,8 @@ void netif_napi_del(struct napi_struct *napi) list_del_init(&napi->dev_list); napi_free_frags(napi); - kfree_skb_list(napi->gro_list); - napi->gro_list = NULL; + gro_list_free(&napi->gro_list); + INIT_LIST_HEAD(&napi->gro_list); napi->gro_count = 0; } EXPORT_SYMBOL(netif_napi_del); @@ -5787,7 +5775,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) goto out_unlock; } - if (n->gro_list) { + if (!list_empty(&n->gro_list)) { /* flush too old packets * If HZ < 1000, flush all packets. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..b1f274f22d85 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3815,14 +3815,14 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); - struct sk_buff *lp, *p = *head; unsigned int delta_truesize; + struct sk_buff *lp; if (unlikely(p->len + len >= 65536)) return -E2BIG; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index ee28440f57c5..fd8faa0dfa61 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -427,13 +427,13 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) } EXPORT_SYMBOL(sysfs_format_mac); -struct sk_buff **eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; const struct packet_offload *ptype; + unsigned int hlen, off_eth; + struct sk_buff *pp = NULL; + struct ethhdr *eh, *eh2; + struct sk_buff *p; __be16 type; int flush = 1; @@ -448,7 +448,7 @@ struct sk_buff **eth_gro_receive(struct sk_buff **head, flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 15e125558c76..06b218a2870f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1384,12 +1384,12 @@ out: } EXPORT_SYMBOL(inet_gso_segment); -struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; - struct sk_buff *p; + struct sk_buff *pp = NULL; const struct iphdr *iph; + struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; @@ -1425,7 +1425,7 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; @@ -1505,8 +1505,8 @@ out: } EXPORT_SYMBOL(inet_gro_receive); -static struct sk_buff **ipip_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipip_gro_receive(struct list_head *head, + struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 7cf755ef9efb..bbeecd13e534 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -28,8 +28,8 @@ #include #include -static struct sk_buff **esp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..efdc9e1f741e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -224,14 +224,14 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *fou_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { - const struct net_offload *ops; - struct sk_buff **pp = NULL; u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff *pp = NULL; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -305,13 +305,13 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gue_gro_receive(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; @@ -397,7 +397,7 @@ static struct sk_buff **gue_gro_receive(struct sock *sk, skb_gro_pull(skb, hdrlen); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..b9673c21be45 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -108,10 +108,10 @@ out: return segs; } -static struct sk_buff **gre_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *gre_gro_receive(struct list_head *head, + struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; @@ -182,7 +182,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 8cc7c3487330..f5aee641f825 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -180,9 +180,9 @@ out: return segs; } -struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) { - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th; struct tcphdr *th2; @@ -220,7 +220,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) len = skb_gro_len(skb); flags = tcp_flag_word(th); - for (; (p = *head); head = &p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -233,7 +233,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto found; } - + p = NULL; goto out_check_final; found: @@ -263,7 +263,7 @@ found: flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); - if (flush || skb_gro_receive(head, skb)) { + if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } @@ -277,7 +277,7 @@ out_check_final: TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) - pp = head; + pp = p; out: NAPI_GRO_CB(skb)->flush |= (flush != 0); @@ -302,7 +302,7 @@ int tcp_gro_complete(struct sk_buff *skb) } EXPORT_SYMBOL(tcp_gro_complete); -static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..ac46c1c55c99 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -343,10 +343,11 @@ out: return segs; } -struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) +struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + struct udphdr *uh, udp_lookup_t lookup) { - struct sk_buff *p, **pp = NULL; + struct sk_buff *pp = NULL; + struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; @@ -371,7 +372,7 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unflush: flush = 0; - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -399,8 +400,8 @@ out: } EXPORT_SYMBOL(udp_gro_receive); -static struct sk_buff **udp4_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp4_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 27f59b61f70f..ddfa533a84e5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -49,8 +49,8 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) return 0; } -static struct sk_buff **esp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *esp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 5b3f2f89ef41..37ff4805b20c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -163,11 +163,11 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph, return len; } -static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ipv6_gro_receive(struct list_head *head, + struct sk_buff *skb) { const struct net_offload *ops; - struct sk_buff **pp = NULL; + struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; @@ -214,7 +214,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, flush--; nlen = skb_network_header_len(skb); - for (p = *head; p; p = p->next) { + list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* */ @@ -263,8 +263,8 @@ out: return pp; } -static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ @@ -278,8 +278,8 @@ static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } -static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 278e49cd67d4..e72947c99454 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -15,8 +15,8 @@ #include #include "ip6_offload.h" -static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 03a2ff3fe1e6..95dee9ca8d22 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -114,8 +114,8 @@ out: return segs; } -static struct sk_buff **udp6_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff *udp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); -- cgit v1.2.3 From 60513bd82c825b659c05957e4f8106ba06f0797f Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:04 -0700 Subject: net: sched: pass extack pointer to block binds and cb registration Pass the extact struct from a tc qdisc add to the block bind function and, in turn, to the setup_tc ndo of binding device via the tc_block_offload struct. Pass this back to any block callback registrations to allow netlink logging of fails in the bind process. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++---- drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +- .../net/ethernet/netronome/nfp/flower/offload.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/netdevsim/netdev.c | 2 +- include/net/pkt_cls.h | 11 ++++++---- net/dsa/slave.c | 2 +- net/sched/cls_api.c | 25 ++++++++++++++-------- 17 files changed, 43 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 176fc9f4d7de..b5fc6414a951 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7984,7 +7984,7 @@ static int bnxt_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb, - bp, bp); + bp, bp, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp); return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index 05d405905906..0745f2dfc80c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -173,7 +173,7 @@ static int bnxt_vf_rep_setup_tc_block(struct net_device *dev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_vf_rep_setup_tc_block_cb, - vf_rep, vf_rep); + vf_rep, vf_rep, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_vf_rep_setup_tc_block_cb, vf_rep); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc03c175a3cd..96bc177d54de 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3016,7 +3016,7 @@ static int cxgb_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb, - pi, dev); + pi, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi); return 0; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 7ad2b1b0b125..426b0ccb1fc6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7554,7 +7554,7 @@ static int i40e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb, - np, np); + np, np, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np); return 0; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index dc56a8667495..5906c1c1d19d 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2926,7 +2926,7 @@ static int i40evf_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40evf_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40evf_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 6a78d8272eb2..f1e3397bd405 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2728,7 +2728,7 @@ static int igb_setup_tc_block(struct igb_adapter *adapter, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, igb_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3e87dbbc9024..d29bd8fc3ff3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9325,7 +9325,7 @@ static int ixgbe_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, ixgbe_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, ixgbe_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 56c1b6f5593e..134f20a182b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3371,7 +3371,7 @@ static int mlx5e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb, priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 57987f6546e8..3f2fe95e01d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -797,7 +797,7 @@ static int mlx5e_rep_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 968b88af2ef5..d2bc335dda11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1503,7 +1503,8 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, static int mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct tcf_block *block, bool ingress) + struct tcf_block *block, bool ingress, + struct netlink_ext_ack *extack) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_acl_block *acl_block; @@ -1518,7 +1519,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, return -ENOMEM; block_cb = __tcf_block_cb_register(block, mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp, acl_block); + mlxsw_sp, acl_block, extack); if (IS_ERR(block_cb)) { err = PTR_ERR(block_cb); goto err_cb_register; @@ -1596,11 +1597,12 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, switch (f->command) { case TC_BLOCK_BIND: err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port, - mlxsw_sp_port); + mlxsw_sp_port, f->extack); if (err) return err; err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, - f->block, ingress); + f->block, ingress, + f->extack); if (err) { tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port); return err; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index fcdfb8e7fdea..bf46f7bff912 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -206,7 +206,7 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_bpf_setup_tc_block_cb, - nn, nn); + nn, nn, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_bpf_setup_tc_block_cb, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c0e74aa4cb5e..a427dab4bf49 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -627,7 +627,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_flower_setup_tc_block_cb, - repr, repr); + repr, repr, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_flower_setup_tc_block_cb, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cba46b62a1cd..2354e30caa78 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3776,7 +3776,7 @@ static int stmmac_setup_tc_block(struct stmmac_priv *priv, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv); return 0; diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index ec68f38213d9..c9dacc6fcd59 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -260,7 +260,7 @@ nsim_setup_tc_block(struct net_device *dev, struct tc_block_offload *f) switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nsim_setup_tc_block_cb, - ns, ns); + ns, ns, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nsim_setup_tc_block_cb, ns); return 0; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3c1a2c47cd4..a2c6d35ba057 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -73,10 +73,11 @@ void tcf_block_cb_incref(struct tcf_block_cb *block_cb); unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, + struct netlink_ext_ack *extack); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, struct netlink_ext_ack *extack); void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -161,7 +162,8 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) static inline struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { return NULL; } @@ -169,7 +171,7 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, static inline int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { return 0; } @@ -596,6 +598,7 @@ struct tc_block_offload { enum tc_block_command command; enum tcf_block_binder_type binder_type; struct tcf_block *block; + struct netlink_ext_ack *extack; }; struct tc_cls_common_offload { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1e3b6a6d8a40..71536c435132 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev); + return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cb, dev); return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdc3c87c53e6..8c9fb4b827a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command) + enum tc_block_command command, + struct netlink_ext_ack *extack) { struct tc_block_offload bo = {}; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; + bo.extack = extack; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; @@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); return -EOPNOTSUPP; + } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; return err; @@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_chain_head_change_cb_add; - err = tcf_block_offload_bind(block, q, ei); + err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; @@ -748,7 +753,8 @@ EXPORT_SYMBOL(tcf_block_cb_decref); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; @@ -772,11 +778,12 @@ EXPORT_SYMBOL(__tcf_block_cb_register); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, + extack); return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; } EXPORT_SYMBOL(tcf_block_cb_register); -- cgit v1.2.3 From e56185c78b500ac4d08768278ad8a25d5b756942 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:05 -0700 Subject: net: sched: add tcf_proto_op to offload a rule Create a new tcf_proto_op called 'reoffload' that generates a new offload message for each node in a tcf_proto. Pointers to the tcf_proto and whether the offload request is to add or delete the node are included. Also included is a callback function to send the offload message to and the option of priv data to go with the cb. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 3 --- include/net/sch_generic.h | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9e59ebfded62..5ff11adbe2a6 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -190,9 +190,6 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #endif } -typedef int tc_setup_cb_t(enum tc_setup_type type, - void *type_data, void *cb_priv); - #ifdef CONFIG_NET_CLS_ACT int tc_setup_cb_egdev_register(const struct net_device *dev, tc_setup_cb_t *cb, void *cb_priv); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6488daa32f82..18adc9142b18 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -20,6 +20,9 @@ struct qdisc_walker; struct tcf_walker; struct module; +typedef int tc_setup_cb_t(enum tc_setup_type type, + void *type_data, void *cb_priv); + struct qdisc_rate_table { struct tc_ratespec rate; u32 data[256]; @@ -256,6 +259,9 @@ struct tcf_proto_ops { bool *last, struct netlink_ext_ack *); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); + int (*reoffload)(struct tcf_proto *tp, bool add, + tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); /* rtnetlink specific */ -- cgit v1.2.3 From 31533cba4327aefeafe8a7d57de0c737a3b2faa6 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:06 -0700 Subject: net: sched: cls_flower: implement offload tcf_proto_op Add the reoffload tcf_proto_op in flower to generate an offload message for each filter in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. A filter contains a flag to indicate if it is in hardware or not. To ensure the reoffload function properly maintains this flag, keep a reference counter for the number of instances of the filter that are in hardware. Only update the flag when this counter changes from or to 0. Add a generic helper function to implement this behaviour. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 15 +++++++++++++++ net/sched/cls_flower.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 18adc9142b18..7432100027b7 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -336,6 +336,21 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) block->offloadcnt--; } +static inline void +tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, + u32 *flags, bool add) +{ + if (add) { + if (!*cnt) + tcf_block_offload_inc(block, flags); + (*cnt)++; + } else { + (*cnt)--; + if (!*cnt) + tcf_block_offload_dec(block, flags); + } +} + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9e8b26a80fb3..352876bb901b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -87,6 +87,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -289,6 +290,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { + f->in_hw_count = err; tcf_block_offload_inc(block, &f->flags); } @@ -1087,6 +1089,47 @@ skip: } } +static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; + struct fl_flow_mask *mask; + struct cls_fl_filter *f; + int err; + + list_for_each_entry(mask, &head->masks, list) { + list_for_each_entry(f, &mask->filters, list) { + if (tc_skip_hw(f->flags)) + continue; + + tc_cls_common_offload_init(&cls_flower.common, tp, + f->flags, extack); + cls_flower.command = add ? + TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.dissector = &mask->dissector; + cls_flower.mask = &f->mkey; + cls_flower.key = &f->key; + cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; + + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + if (err) { + if (add && tc_skip_sw(f->flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &f->in_hw_count, + &f->flags, add); + } + } + + return 0; +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1438,6 +1481,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .change = fl_change, .delete = fl_delete, .walk = fl_walk, + .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 326367427cc09d38e4c1d145131ee2e228ac94c5 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:10 -0700 Subject: net: sched: call reoffload op on block callback reg Call the reoffload tcf_proto_op on all tcf_proto nodes in all chains of a block when a callback tries to register to a block that already has offloaded rules. If all existing rules cannot be offloaded then the registration is rejected. This replaces the previous policy of rejecting such callback registration outright. On unregistration of a callback, the rules are flushed for that given cb. The implementation of block sharing in the NFP driver, for example, duplicates shared rules to all devs bound to a block. This meant that rules could still exist in hw even after a device is unbound from a block (assuming the block still remains active). Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 +- include/net/pkt_cls.h | 6 ++- net/sched/cls_api.c | 54 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index d2bc335dda11..52437363766a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1542,7 +1542,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, err_block_bind: if (!tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); err_cb_register: mlxsw_sp_acl_block_destroy(acl_block); } @@ -1572,7 +1572,7 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, mlxsw_sp_port, ingress); if (!err && !tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); mlxsw_sp_acl_block_destroy(acl_block); } } diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2c6d35ba057..4070b8eb6d14 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -78,7 +78,8 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -177,7 +178,8 @@ int tcf_block_cb_register(struct tcf_block *block, } static inline -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8c9fb4b827a1..bbf8dda96b0e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -751,19 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) } EXPORT_SYMBOL(tcf_block_cb_decref); +static int +tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv, bool add, bool offload_in_use, + struct netlink_ext_ack *extack) +{ + struct tcf_chain *chain; + struct tcf_proto *tp; + int err; + + list_for_each_entry(chain, &block->chain_list, list) { + for (tp = rtnl_dereference(chain->filter_chain); tp; + tp = rtnl_dereference(tp->next)) { + if (tp->ops->reoffload) { + err = tp->ops->reoffload(tp, add, cb, cb_priv, + extack); + if (err && add) + goto err_playback_remove; + } else if (add && offload_in_use) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); + goto err_playback_remove; + } + } + } + + return 0; + +err_playback_remove: + tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, + extack); + return err; +} + struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; + int err; - /* At this point, playback of previous block cb calls is not supported, - * so forbid to register to block which already has some offloaded - * filters present. - */ - if (tcf_block_offload_in_use(block)) - return ERR_PTR(-EOPNOTSUPP); + /* Replay any already present rules */ + err = tcf_block_playback_offloads(block, cb, cb_priv, true, + tcf_block_offload_in_use(block), + extack); + if (err) + return ERR_PTR(err); block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) @@ -788,8 +822,12 @@ int tcf_block_cb_register(struct tcf_block *block, } EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { + tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, + false, tcf_block_offload_in_use(block), + NULL); list_del(&block_cb->list); kfree(block_cb); } @@ -803,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block, block_cb = tcf_block_cb_lookup(block, cb, cb_ident); if (!block_cb) return; - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); } EXPORT_SYMBOL(tcf_block_cb_unregister); -- cgit v1.2.3 From d020d4559de9baf47cafa2669f29ea59d11a914c Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:31 -0400 Subject: net sched actions: fix coding style in pedit headers Fix coding style issues in tc pedit headers detected by the checkpatch script. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- include/net/tc_act/tc_pedit.h | 1 + include/uapi/linux/tc_act/tc_pedit.h | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 227a6f1d02f4..fac3ad4a86de 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -17,6 +17,7 @@ struct tcf_pedit { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; }; + #define to_pedit(a) ((struct tcf_pedit *)a) static inline bool is_tcf_pedit(const struct tc_action *a) diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h index 162d1094c41c..24ec792dacc1 100644 --- a/include/uapi/linux/tc_act/tc_pedit.h +++ b/include/uapi/linux/tc_act/tc_pedit.h @@ -17,13 +17,15 @@ enum { TCA_PEDIT_KEY_EX, __TCA_PEDIT_MAX }; + #define TCA_PEDIT_MAX (__TCA_PEDIT_MAX - 1) - + enum { TCA_PEDIT_KEY_EX_HTYPE = 1, TCA_PEDIT_KEY_EX_CMD = 2, __TCA_PEDIT_KEY_EX_MAX }; + #define TCA_PEDIT_KEY_EX_MAX (__TCA_PEDIT_KEY_EX_MAX - 1) /* TCA_PEDIT_KEY_EX_HDR_TYPE_NETWROK is a special case for legacy users. It @@ -38,6 +40,7 @@ enum pedit_header_type { TCA_PEDIT_KEY_EX_HDR_TYPE_UDP = 5, __PEDIT_HDR_TYPE_MAX, }; + #define TCA_PEDIT_HDR_TYPE_MAX (__PEDIT_HDR_TYPE_MAX - 1) enum pedit_cmd { @@ -45,6 +48,7 @@ enum pedit_cmd { TCA_PEDIT_KEY_EX_CMD_ADD = 1, __PEDIT_CMD_MAX, }; + #define TCA_PEDIT_CMD_MAX (__PEDIT_CMD_MAX - 1) struct tc_pedit_key { @@ -55,13 +59,14 @@ struct tc_pedit_key { __u32 offmask; __u32 shift; }; - + struct tc_pedit_sel { tc_gen; unsigned char nkeys; unsigned char flags; struct tc_pedit_key keys[0]; }; + #define tc_pedit tc_pedit_sel #endif -- cgit v1.2.3 From f564650106a6e85702660fefd59fdff0877ab46a Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 27 Jun 2018 10:34:25 -0300 Subject: netfilter: check if the socket netns is correct. Netfilter assumes that if the socket is present in the skb, then it can be used because that reference is cleaned up while the skb is crossing netns. We want to change that to preserve the socket reference in a future patch, so this is a preparation updating netfilter to check if the socket netns matches before use it. Signed-off-by: Flavio Leitner Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/netfilter/nf_log.h | 3 ++- net/ipv4/netfilter/nf_log_ipv4.c | 8 ++++---- net/ipv6/netfilter/nf_log_ipv6.c | 8 ++++---- net/netfilter/nf_conntrack_broadcast.c | 2 +- net/netfilter/nf_log_common.c | 5 +++-- net/netfilter/nf_nat_core.c | 6 +++++- net/netfilter/nft_meta.c | 9 ++++++--- net/netfilter/nft_socket.c | 5 ++++- net/netfilter/xt_cgroup.c | 6 ++++-- net/netfilter/xt_owner.c | 2 +- net/netfilter/xt_recent.c | 3 ++- net/netfilter/xt_socket.c | 8 ++++++++ 12 files changed, 44 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e811ac07ea94..0d3920896d50 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -106,7 +106,8 @@ int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, u8 proto, int fragment, unsigned int offset, unsigned int logflags); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..1e6f28c97d3a 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -35,7 +35,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct nf_log_buf *m, +static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -183,7 +183,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ nf_log_buf_add(m, "["); - dump_ipv4_packet(m, info, skb, + dump_ipv4_packet(net, m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); nf_log_buf_add(m, "] "); } @@ -251,7 +251,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -333,7 +333,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv4_mac_header(m, loginfo, skb); - dump_ipv4_packet(m, loginfo, skb, 0); + dump_ipv4_packet(net, m, loginfo, skb, 0); nf_log_buf_close(m); } diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index b397a8fe88b9..c6bf580d0f33 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -36,7 +36,7 @@ static const struct nf_loginfo default_loginfo = { }; /* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct nf_log_buf *m, +static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) @@ -258,7 +258,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 3+maxlen */ if (recurse) { nf_log_buf_add(m, "["); - dump_ipv6_packet(m, info, skb, + dump_ipv6_packet(net, m, info, skb, ptr + sizeof(_icmp6h), 0); nf_log_buf_add(m, "] "); } @@ -278,7 +278,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, /* Max length: 15 "UID=4294967295 " */ if ((logflags & NF_LOG_UID) && recurse) - nf_log_dump_sk_uid_gid(m, skb->sk); + nf_log_dump_sk_uid_gid(net, m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (recurse && skb->mark) @@ -365,7 +365,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf, if (in != NULL) dump_ipv6_mac_header(m, loginfo, skb); - dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); nf_log_buf_close(m); } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index a1086bdec242..5423b197d98a 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -32,7 +32,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, __be32 mask = 0; /* we're only interested in locally generated packets */ - if (skb->sk == NULL) + if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk))) goto out; if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) goto out; diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index dc61399e30be..a8c5c846aec1 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -132,9 +132,10 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); -void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk) +void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) { - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) return; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 46f9df99d276..86df2a1666fd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -108,6 +108,7 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) struct flowi fl; unsigned int hh_len; struct dst_entry *dst; + struct sock *sk = skb->sk; int err; err = xfrm_decode_session(skb, &fl, family); @@ -119,7 +120,10 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - dst = xfrm_lookup(net, dst, &fl, skb->sk, 0); + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1105a23bda5e..2b94dcc43456 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -107,7 +107,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKUID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -123,7 +124,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, break; case NFT_META_SKGID: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; read_lock_bh(&sk->sk_callback_lock); @@ -214,7 +216,8 @@ static void nft_meta_get_eval(const struct nft_expr *expr, #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: sk = skb_to_full_sk(skb); - if (!sk || !sk_fullsock(sk)) + if (!sk || !sk_fullsock(sk) || + !net_eq(nft_net(pkt), sock_net(sk))) goto err; *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 74e1b3bd6954..998c2b546f6d 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -23,6 +23,9 @@ static void nft_socket_eval(const struct nft_expr *expr, struct sock *sk = skb->sk; u32 *dest = ®s->data[priv->dreg]; + if (sk && !net_eq(nft_net(pkt), sock_net(sk))) + sk = NULL; + if (!sk) switch(nft_pf(pkt)) { case NFPROTO_IPV4: @@ -39,7 +42,7 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } - if(!sk) { + if (!sk) { nft_reg_store8(dest, 0); return; } diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 7df2dece57d3..5d92e1781980 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -72,8 +72,9 @@ static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; + struct sock *sk = skb->sk; - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ @@ -85,8 +86,9 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; - if (!skb->sk || !sk_fullsock(skb->sk)) + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 3d705c688a27..46686fb73784 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -67,7 +67,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) struct sock *sk = skb_to_full_sk(skb); struct net *net = xt_net(par); - if (sk == NULL || sk->sk_socket == NULL) + if (!sk || !sk->sk_socket || !net_eq(net, sock_net(sk))) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 07085c22b19c..f44de4bc2100 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -265,7 +265,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par) } /* use TTL as seen before forwarding */ - if (xt_out(par) != NULL && skb->sk == NULL) + if (xt_out(par) != NULL && + (!skb->sk || !net_eq(net, sock_net(skb->sk)))) ttl++; spin_lock_bh(&recent_lock); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5c0779c4fa3c..0472f3472842 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -56,8 +56,12 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; @@ -113,8 +117,12 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) struct sk_buff *pskb = (struct sk_buff *)skb; struct sock *sk = skb->sk; + if (!net_eq(xt_net(par), sock_net(sk))) + sk = NULL; + if (!sk) sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par)); + if (sk) { bool wildcard; bool transparent = true; -- cgit v1.2.3 From b0e9a2fe3ff971950833bc0ffc383babd9443bc4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Jun 2018 15:31:00 +0800 Subject: sctp: add support for SCTP_REUSE_PORT sockopt This feature is actually already supported by sk->sk_reuse which can be set by socket level opt SO_REUSEADDR. But it's not working exactly as RFC6458 demands in section 8.1.27, like: - This option only supports one-to-one style SCTP sockets - This socket option must not be used after calling bind() or sctp_bindx(). Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs. Otherwise, the programs with SCTP_REUSE_PORT from other systems will not work in linux. To separate it from the socket level version, this patch adds 'reuse' in sctp_sock and it works pretty much as sk->sk_reuse, but with some extra setup limitations that are needed when it is being enabled. "It should be noted that the behavior of the socket-level socket option to reuse ports and/or addresses for SCTP sockets is unspecified", so it leaves SO_REUSEADDR as is for the compatibility. Note that the name SCTP_REUSE_PORT is somewhat confusing, as its functionality is nearly identical to SO_REUSEADDR, but with some extra restrictions. Here it uses 'reuse' in sctp_sock instead of 'reuseport'. As for sk->sk_reuseport support for SCTP, it will be added in another patch. Thanks to Neil to make this clear. v1->v2: - add sctp_sk->reuse to separate it from the socket level version. v2->v3: - improve changelog according to Marcelo's suggestion. Acked-by: Neil Horman Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 62 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 57 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e0f962d27386..701a51736fa5 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -220,6 +220,7 @@ struct sctp_sock { __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, + reuse:1, disable_fragments:1, v4mapped:1, frag_interleave:1, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index b64d583bf053..c02986a284db 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -100,6 +100,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RECVNXTINFO 33 #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 +#define SCTP_REUSE_PORT 36 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0e91e83eea5a..bf11f9cacb63 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4170,6 +4170,28 @@ out: return retval; } +static int sctp_setsockopt_reuse_port(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + int val; + + if (!sctp_style(sk, TCP)) + return -EOPNOTSUPP; + + if (sctp_sk(sk)->ep->base.bind_addr.port) + return -EFAULT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->reuse = !!val; + + return 0; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4364,6 +4386,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_interleaving_supported(sk, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_setsockopt_reuse_port(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7197,6 +7222,26 @@ out: return retval; } +static int sctp_getsockopt_reuse_port(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sctp_sk(sk)->reuse; + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -7392,6 +7437,9 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; + case SCTP_REUSE_PORT: + retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -7429,6 +7477,7 @@ static struct sctp_bind_bucket *sctp_bucket_create( static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { + bool reuse = (sk->sk_reuse || sctp_sk(sk)->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct sctp_bind_bucket *pp; unsigned short snum; @@ -7501,13 +7550,11 @@ pp_found: * used by other socket (pp->owner not empty); that other * socket is going to be sk2. */ - int reuse = sk->sk_reuse; struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); - if (pp->fastreuse && sk->sk_reuse && - sk->sk_state != SCTP_SS_LISTENING) + if (pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) goto success; /* Run through the list of sockets bound to the port @@ -7525,7 +7572,7 @@ pp_found: ep2 = sctp_sk(sk2)->ep; if (sk == sk2 || - (reuse && sk2->sk_reuse && + (reuse && (sk2->sk_reuse || sctp_sk(sk2)->reuse) && sk2->sk_state != SCTP_SS_LISTENING)) continue; @@ -7549,12 +7596,12 @@ pp_not_found: * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { - if (sk->sk_reuse && sk->sk_state != SCTP_SS_LISTENING) + if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; } else if (pp->fastreuse && - (!sk->sk_reuse || sk->sk_state == SCTP_SS_LISTENING)) + (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; /* We are set, so fill up all the data in the hash table @@ -7685,7 +7732,7 @@ int sctp_inet_listen(struct socket *sock, int backlog) err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; - if (sk->sk_reuse) + if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } @@ -8550,6 +8597,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; + sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sctp_destruct_sock; -- cgit v1.2.3 From 256c87c17c53e60882a43dcf3e98f3bf859eaf6f Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 26 Jun 2018 21:39:36 -0700 Subject: net: check tunnel option type in tunnel flags Check the tunnel option type stored in tunnel flags when creating options for tunnels. Thereby ensuring we do not set geneve, vxlan or erspan tunnel options on interfaces that are not associated with them. Make sure all users of the infrastructure set correct flags, for the BPF helper we have to set all bits to keep backward compatibility. Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 ++++-- drivers/net/vxlan.c | 3 ++- include/net/ip_tunnels.h | 8 ++++++-- net/core/filter.c | 2 +- net/ipv4/ip_gre.c | 2 ++ net/ipv6/ip6_gre.c | 2 ++ net/openvswitch/flow_netlink.c | 7 ++++++- 7 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3e94375b9b01..471edd76ff55 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -236,7 +236,8 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } /* Update tunnel dst according to Geneve options. */ ip_tunnel_info_opts_set(&tun_dst->u.tun_info, - gnvh->options, gnvh->opt_len * 4); + gnvh->options, gnvh->opt_len * 4, + TUNNEL_GENEVE_OPT); } else { /* Drop packets w/ critical options, * since we don't support any... @@ -675,7 +676,8 @@ static void geneve_build_header(struct genevehdr *geneveh, geneveh->proto_type = htons(ETH_P_TEB); geneveh->rsvd2 = 0; - ip_tunnel_info_opts_get(geneveh->options, info); + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) + ip_tunnel_info_opts_get(geneveh->options, info); } static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index cc14e0cd5647..7eb30d7c8bd7 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2122,7 +2122,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; - if (info->options_len) + if (info->options_len && + info->key.tun_flags & TUNNEL_VXLAN_OPT) md = ip_tunnel_info_opts(info); ttl = info->key.ttl; tos = info->key.tos; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 90ff430f5e9d..b0d022ff6ea1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -466,10 +466,12 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { memcpy(ip_tunnel_info_opts(info), from, len); info->options_len = len; + info->key.tun_flags |= flags; } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) @@ -511,9 +513,11 @@ static inline void ip_tunnel_info_opts_get(void *to, } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len) + const void *from, int len, + __be16 flags) { info->options_len = 0; + info->key.tun_flags |= flags; } #endif /* CONFIG_INET */ diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..dade922678f6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3582,7 +3582,7 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; - ip_tunnel_info_opts_set(info, from, size); + ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2d8efeecf619..c8ca5d8f0f75 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -587,6 +587,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, goto err_free_skb; key = &tun_info->key; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto err_free_rt; md = ip_tunnel_info_opts(tun_info); if (!md) goto err_free_rt; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c8cf2fdbb13b..367177786e34 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -990,6 +990,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; + if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) + goto tx_err; md = ip_tunnel_info_opts(tun_info); if (!md) goto tx_err; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 492ab0c36f7c..391c4073a6dc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2516,7 +2516,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; + __be16 dst_opt_type; + dst_opt_type = 0; ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) @@ -2528,10 +2530,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, err = validate_geneve_opts(&key); if (err < 0) return err; + dst_opt_type = TUNNEL_GENEVE_OPT; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + dst_opt_type = TUNNEL_VXLAN_OPT; break; case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + dst_opt_type = TUNNEL_ERSPAN_OPT; break; } } @@ -2574,7 +2579,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, */ ip_tunnel_info_opts_set(tun_info, TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len); + key.tun_opts_len, dst_opt_type); add_nested_action_end(*sfa, start); return err; -- cgit v1.2.3 From 0afff91c6f5ecef27715ea71e34dc2baacba1060 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 28 Jun 2018 19:05:05 +0200 Subject: net/smc: add pnetid support s390 hardware supports the definition of a so-call Physical NETwork IDentifier (short PNETID) per network device port. These PNETIDS can be used to identify network devices that are attached to the same physical network (broadcast domain). On s390 try to use the PNETID of the ethernet device port used for initial connecting, and derive the IB device port used for SMC RDMA traffic. On platforms without PNETID support fall back to the existing solution of a configured pnet table. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/smc.h | 2 + net/smc/smc_ib.c | 6 ++- net/smc/smc_ib.h | 3 ++ net/smc/smc_pnet.c | 109 +++++++++++++++++++++++++++++++++++++++++++---------- net/smc/smc_pnet.h | 14 +++++++ 5 files changed, 114 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/smc.h b/include/net/smc.h index 8381d163fefa..2173932fab9d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -11,6 +11,8 @@ #ifndef _SMC_H #define _SMC_H +#define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ + struct smc_hashinfo { rwlock_t lock; struct hlist_head ht; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f8b159ced032..36de2fd76170 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -504,8 +504,12 @@ static void smc_ib_add_dev(struct ib_device *ibdev) port_cnt = smcibdev->ibdev->phys_port_cnt; for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); - i++) + i++) { set_bit(i, &smcibdev->port_event_mask); + /* determine pnetids of the port */ + smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i]); + } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 2c480b352928..7c1223c91229 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -15,6 +15,7 @@ #include #include #include +#include #define SMC_MAX_PORTS 2 /* Max # of ports */ #define SMC_GID_SIZE sizeof(union ib_gid) @@ -40,6 +41,8 @@ struct smc_ib_device { /* ib-device infos for smc */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; + /* pnetid per port */ u8 initialized : 1; /* ib dev CQ, evthdl done */ struct work_struct port_event_work; unsigned long port_event_mask; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index a82a5cad0282..cdc6e23b6ce1 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -23,12 +23,10 @@ #include "smc_pnet.h" #include "smc_ib.h" -#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */ - static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, - .len = SMC_MAX_PNET_ID_LEN - 1 + .len = SMC_MAX_PNETID_LEN - 1 }, [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING, @@ -65,7 +63,7 @@ static struct smc_pnettable { */ struct smc_pnetentry { struct list_head list; - char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; struct net_device *ndev; struct smc_ib_device *smcibdev; u8 ib_port; @@ -209,7 +207,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) return false; while (--end >= bf && isspace(*end)) ; - if (end - bf >= SMC_MAX_PNET_ID_LEN) + if (end - bf >= SMC_MAX_PNETID_LEN) return false; while (bf <= end) { if (!isalnum(*bf)) @@ -512,26 +510,70 @@ void smc_pnet_exit(void) genl_unregister_family(&smc_pnet_nl_family); } -/* PNET table analysis for a given sock: - * determine ib_device and port belonging to used internal TCP socket - * ethernet interface. +/* Determine one base device for stacked net devices. + * If the lower device level contains more than one devices + * (for instance with bonding slaves), just the first device + * is used to reach a base device. */ -void smc_pnet_find_roce_resource(struct sock *sk, - struct smc_ib_device **smcibdev, u8 *ibport) +static struct net_device *pnet_find_base_ndev(struct net_device *ndev) { - struct dst_entry *dst = sk_dst_get(sk); - struct smc_pnetentry *pnetelem; + int i, nest_lvl; - *smcibdev = NULL; - *ibport = 0; + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = netdev_lower_get_next(ndev, &lower); + } + rtnl_unlock(); + return ndev; +} + +/* Determine the corresponding IB device port based on the hardware PNETID. + * Searching stops at the first matching active IB device port. + */ +static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smc_ib_device *ibdev; + int i; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid, + SMC_MAX_PNETID_LEN) && + smc_ib_port_active(ibdev, i)) { + *smcibdev = ibdev; + *ibport = i; + break; + } + } + } + spin_unlock(&smc_ib_devices.lock); +} + +/* Lookup of coupled ib_device via SMC pnet table */ +static void smc_pnet_find_roce_by_table(struct net_device *netdev, + struct smc_ib_device **smcibdev, + u8 *ibport) +{ + struct smc_pnetentry *pnetelem; - if (!dst) - return; - if (!dst->dev) - goto out_rel; read_lock(&smc_pnettable.lock); list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { - if (dst->dev == pnetelem->ndev) { + if (netdev == pnetelem->ndev) { if (smc_ib_port_active(pnetelem->smcibdev, pnetelem->ib_port)) { *smcibdev = pnetelem->smcibdev; @@ -541,6 +583,35 @@ void smc_pnet_find_roce_resource(struct sock *sk, } } read_unlock(&smc_pnettable.lock); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. + */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport); + if (*smcibdev) + goto out_rel; + + /* lookup via SMC PNET table */ + smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport); + out_rel: dst_release(dst); +out: + return; } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 5a29519db976..ad4455cde9e7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -12,8 +12,22 @@ #ifndef _SMC_PNET_H #define _SMC_PNET_H +#if IS_ENABLED(CONFIG_HAVE_PNETID) +#include +#endif + struct smc_ib_device; +static inline int smc_pnetid_by_dev_port(struct device *dev, + unsigned short port, u8 *pnetid) +{ +#if IS_ENABLED(CONFIG_HAVE_PNETID) + return pnet_id_by_dev_port(dev, port, pnetid); +#else + return -ENOENT; +#endif +} + int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); -- cgit v1.2.3 From c6ba7c9ba43de1b57e9a53946e7ff988554c84ed Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:07 +0200 Subject: net/smc: add base infrastructure for SMC-D and ISM SMC supports two variants: SMC-R and SMC-D. For data transport, SMC-R uses RDMA devices, SMC-D uses so-called Internal Shared Memory (ISM) devices. An ISM device only allows shared memory communication between SMC instances on the same machine. For example, this allows virtual machines on the same host to communicate via SMC without RDMA devices. This patch adds the base infrastructure for SMC-D and ISM devices to the existing SMC code. It contains the following: * ISM driver interface: This interface allows an ISM driver to register ISM devices in SMC. In the process, the driver provides a set of device ops for each device. SMC uses these ops to execute SMC specific operations on or transfer data over the device. * Core SMC-D link group, connection, and buffer support: Link groups, SMC connections and SMC buffers (in smc_core) are extended to support SMC-D. * SMC type checks: Some type checks are added to prevent using SMC-R specific code for SMC-D and vice versa. To actually use SMC-D, additional changes to pnetid, CLC, CDC, etc. are required. These are added in follow-up patches. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- include/net/smc.h | 62 +++++++++++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 11 +- net/smc/smc_core.c | 270 +++++++++++++++++++++++++++++++++++------------ net/smc/smc_core.h | 71 +++++++++---- net/smc/smc_diag.c | 3 +- net/smc/smc_ism.c | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_ism.h | 48 +++++++++ 8 files changed, 679 insertions(+), 92 deletions(-) create mode 100644 net/smc/smc_ism.c create mode 100644 net/smc/smc_ism.h (limited to 'include/net') diff --git a/include/net/smc.h b/include/net/smc.h index 2173932fab9d..824a7af8d654 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -20,4 +20,66 @@ struct smc_hashinfo { int smc_hash_sk(struct sock *sk); void smc_unhash_sk(struct sock *sk); + +/* SMCD/ISM device driver interface */ +struct smcd_dmb { + u64 dmb_tok; + u64 rgid; + u32 dmb_len; + u32 sba_idx; + u32 vlan_valid; + u32 vlan_id; + void *cpu_addr; + dma_addr_t dma_addr; +}; + +#define ISM_EVENT_DMB 0 +#define ISM_EVENT_GID 1 +#define ISM_EVENT_SWR 2 + +struct smcd_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + +struct smcd_dev; + +struct smcd_ops { + int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, + u32 vid); + int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); + int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); + int (*set_vlan_required)(struct smcd_dev *dev); + int (*reset_vlan_required)(struct smcd_dev *dev); + int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, + u32 event_code, u64 info); + int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); +}; + +struct smcd_dev { + const struct smcd_ops *ops; + struct device dev; + void *priv; + u64 local_gid; + struct list_head list; + spinlock_t lock; + struct smc_connection **conn; + struct list_head vlan; + struct workqueue_struct *event_wq; +}; + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs); +int smcd_register_dev(struct smcd_dev *smcd); +void smcd_unregister_dev(struct smcd_dev *smcd); +void smcd_free_dev(struct smcd_dev *smcd); +void smcd_handle_event(struct smcd_dev *dev, struct smcd_event *event); +void smcd_handle_irq(struct smcd_dev *dev, unsigned int bit); #endif /* _SMC_H */ diff --git a/net/smc/Makefile b/net/smc/Makefile index 188104654b54..4df96b4b8130 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o -smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..8ce48799cf68 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -475,8 +475,8 @@ static int smc_connect_rdma(struct smc_sock *smc, int reason_code = 0; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, - aclc->hdr.flag); + local_contact = smc_conn_create(smc, false, aclc->hdr.flag, ibdev, + ibport, &aclc->lcl, NULL, 0); if (local_contact < 0) { if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -491,7 +491,7 @@ static int smc_connect_rdma(struct smc_sock *smc, smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - if (smc_buf_create(smc)) + if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) @@ -894,7 +894,8 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, int *local_contact) { /* allocate connection / link group */ - *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + *local_contact = smc_conn_create(new_smc, false, 0, ibdev, ibport, + &pclc->lcl, NULL, 0); if (*local_contact < 0) { if (*local_contact == -ENOMEM) return SMC_CLC_DECL_MEM;/* insufficient memory*/ @@ -902,7 +903,7 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, } /* create send buffer and rmb */ - if (smc_buf_create(new_smc)) + if (smc_buf_create(new_smc, false)) return SMC_CLC_DECL_MEM; return 0; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index add82b0266f3..daa88db1841a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,6 +25,7 @@ #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" +#include "smc_ism.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -46,8 +47,8 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + (!lgr->is_smcd && lgr->role == SMC_CLNT) ? + SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } /* Register connection's alert token in our lookup structure. @@ -153,16 +154,18 @@ static void smc_lgr_free_work(struct work_struct *work) free: spin_unlock_bh(&smc_lgr_list.lock); if (!delayed_work_pending(&lgr->free_work)) { - if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); smc_lgr_free(lgr); } } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, +static int smc_lgr_create(struct smc_sock *smc, bool is_smcd, struct smc_ib_device *smcibdev, u8 ibport, - char *peer_systemid, unsigned short vlan_id) + char *peer_systemid, unsigned short vlan_id, + struct smcd_dev *smcismdev, u64 peer_gid) { struct smc_link_group *lgr; struct smc_link *lnk; @@ -170,17 +173,23 @@ static int smc_lgr_create(struct smc_sock *smc, int rc = 0; int i; + if (is_smcd && vlan_id) { + rc = smc_ism_get_vlan(smcismdev, vlan_id); + if (rc) + goto out; + } + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } - lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + lgr->is_smcd = is_smcd; lgr->sync_err = 0; - memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); + rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); @@ -189,36 +198,44 @@ static int smc_lgr_create(struct smc_sock *smc, memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = smcibdev; - lnk->ibport = ibport; - lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; - if (!smcibdev->initialized) - smc_ib_setup_per_ibdev(smcibdev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; - + if (is_smcd) { + /* SMC-D specific settings */ + lgr->peer_gid = peer_gid; + lgr->smcd = smcismdev; + } else { + /* SMC-R specific settings */ + lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = SMC_SINGLE_LINK; + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_llc_link_init(lnk); + if (rc) + goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + } smc->conn.lgr = lgr; - rwlock_init(&lgr->conns_lock); spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); @@ -264,7 +281,10 @@ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; - smc_cdc_tx_dismiss_slots(conn); + if (conn->lgr->is_smcd) + smc_ism_unset_conn(conn); + else + smc_cdc_tx_dismiss_slots(conn); smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } @@ -280,8 +300,8 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, - struct smc_buf_desc *buf_desc) +static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; @@ -301,6 +321,25 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, kfree(buf_desc); } +static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, + struct smc_buf_desc *buf_desc) +{ + if (is_dmb) + smc_ism_unregister_dmb(lgr->smcd, buf_desc); + else + kfree(buf_desc->cpu_addr); + kfree(buf_desc); +} + +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + if (lgr->is_smcd) + smcd_buf_free(lgr, is_rmb, buf_desc); + else + smcr_buf_free(lgr, is_rmb, buf_desc); +} + static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf_desc; @@ -332,7 +371,10 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + if (lgr->is_smcd) + smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); + else + smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -357,7 +399,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) lgr->terminating = 1; if (!list_empty(&lgr->list)) /* forget lgr */ list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -374,7 +417,8 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr) node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); - wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + if (!lgr->is_smcd) + wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); smc_lgr_schedule_free_work(lgr); } @@ -392,13 +436,40 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + if (!lgr->is_smcd && + lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) __smc_lgr_terminate(lgr); } spin_unlock_bh(&smc_lgr_list.lock); } +/* Called when SMC-D device is terminated or peer is lost */ +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid) +{ + struct smc_link_group *lgr, *l; + LIST_HEAD(lgr_free_list); + + /* run common cleanup function and build free list */ + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->is_smcd && lgr->smcd == dev && + (!peer_gid || lgr->peer_gid == peer_gid) && + !list_empty(&lgr->list)) { + __smc_lgr_terminate(lgr); + list_move(&lgr->list, &lgr_free_list); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + + /* cancel the regular free workers and actually free lgrs */ + list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { + list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); + } +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ @@ -477,10 +548,30 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) return -ENODEV; } +static bool smcr_lgr_match(struct smc_link_group *lgr, + struct smc_clc_msg_local *lcl, + enum smc_lgr_role role) +{ + return !memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + lgr->role == role; +} + +static bool smcd_lgr_match(struct smc_link_group *lgr, + struct smcd_dev *smcismdev, u64 peer_gid) +{ + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; +} + /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact) + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid) { struct smc_connection *conn = &smc->conn; int local_contact = SMC_FIRST_CONTACT; @@ -502,17 +593,12 @@ int smc_conn_create(struct smc_sock *smc, spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); - if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && + if ((is_smcd ? smcd_lgr_match(lgr, smcd, peer_gid) : + smcr_lgr_match(lgr, lcl, role)) && !lgr->sync_err && - (lgr->role == role) && - (lgr->vlan_id == vlan_id) && - ((role == SMC_CLNT) || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + lgr->vlan_id == vlan_id && + (role == SMC_CLNT || + lgr->conns_num < SMC_RMBS_PER_LGR_MAX)) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; @@ -535,12 +621,13 @@ int smc_conn_create(struct smc_sock *smc, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, smcibdev, ibport, - lcl->id_for_peer, vlan_id); + rc = smc_lgr_create(smc, is_smcd, smcibdev, ibport, + lcl->id_for_peer, vlan_id, smcd, peer_gid); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ - rc = smc_link_determine_gid(conn->lgr); + if (!is_smcd) + rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -609,8 +696,8 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } -static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) +static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, + bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; @@ -668,7 +755,43 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, return buf_desc; } -static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) +#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ + +static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, + bool is_dmb, int bufsize) +{ + struct smc_buf_desc *buf_desc; + int rc; + + if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) + return ERR_PTR(-EAGAIN); + + /* try to alloc a new DMB */ + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return ERR_PTR(-ENOMEM); + if (is_dmb) { + rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); + if (rc) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + memset(buf_desc->cpu_addr, 0, bufsize); + buf_desc->len = bufsize; + } else { + buf_desc->cpu_addr = kzalloc(bufsize, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!buf_desc->cpu_addr) { + kfree(buf_desc); + return ERR_PTR(-EAGAIN); + } + buf_desc->len = bufsize; + } + return buf_desc; +} + +static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) { struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; @@ -706,7 +829,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) break; /* found reusable slot */ } - buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); + if (is_smcd) + buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); + else + buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) @@ -728,6 +855,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); + if (is_smcd) + smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; smc->sk.sk_sndbuf = bufsize * 2; @@ -740,6 +869,8 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -748,6 +879,8 @@ void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -756,6 +889,8 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -764,6 +899,8 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + if (!conn->lgr || conn->lgr->is_smcd) + return; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } @@ -774,16 +911,16 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ -int smc_buf_create(struct smc_sock *smc) +int smc_buf_create(struct smc_sock *smc, bool is_smcd) { int rc; /* create send buffer */ - rc = __smc_buf_create(smc, false); + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; /* create rmb */ - rc = __smc_buf_create(smc, true); + rc = __smc_buf_create(smc, is_smcd, true); if (rc) smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; @@ -865,7 +1002,8 @@ void smc_core_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + if (!lgr->is_smcd) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 93cb3523bf50..cd9268a9570e 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -124,15 +124,28 @@ struct smc_buf_desc { void *cpu_addr; /* virtual address of buffer */ struct page *pages; int len; /* length of buffer */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ u32 used; /* currently used / unused */ u8 reused : 1; /* new created / reused */ u8 regerr : 1; /* err during registration */ + union { + struct { /* SMC-R */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + }; + struct { /* SMC-D */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ + }; + }; }; struct smc_rtoken { /* address/key of remote RMB */ @@ -148,12 +161,10 @@ struct smc_rtoken { /* address/key of remote RMB */ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) */ +struct smcd_dev; + struct smc_link_group { struct list_head list; - enum smc_lgr_role role; /* client or server */ - struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ - char peer_systemid[SMC_SYSTEMID_LEN]; - /* unique system_id of peer */ struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ @@ -163,17 +174,35 @@ struct smc_link_group { rwlock_t sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ rwlock_t rmbs_lock; /* protects rx buffers */ - struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] - [SMC_LINKS_PER_LGR_MAX]; - /* remote addr/key pairs */ - unsigned long rtokens_used_mask[BITS_TO_LONGS( - SMC_RMBS_PER_LGR_MAX)]; - /* used rtoken elements */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ u8 sync_err : 1; /* lgr no longer fits to peer */ u8 terminating : 1;/* lgr is terminating */ + + bool is_smcd; /* SMC-R or SMC-D */ + union { + struct { /* SMC-R */ + enum smc_lgr_role role; + /* client or server */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; + /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + unsigned long rtokens_used_mask[BITS_TO_LONGS + (SMC_RMBS_PER_LGR_MAX)]; + /* used rtoken elements */ + }; + struct { /* SMC-D */ + u64 peer_gid; + /* Peer GID (remote) */ + struct smcd_dev *smcd; + /* ISM device for VLAN reg. */ + }; + }; }; /* Find the connection associated with the given alert token in the link group. @@ -217,7 +246,8 @@ void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); -int smc_buf_create(struct smc_sock *smc); +void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid); +int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); @@ -227,9 +257,12 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); + void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, +int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact, struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); + struct smc_clc_msg_local *lcl, struct smcd_dev *smcd, + u64 peer_gid); +void smcd_conn_free(struct smc_connection *conn); void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 839354402215..64ce107c24d9 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -136,7 +136,8 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr && + if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c new file mode 100644 index 000000000000..ca1ce42fd49f --- /dev/null +++ b/net/smc/smc_ism.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * Functions for ISM device. + * + * Copyright IBM Corp. 2018 + */ + +#include +#include +#include + +#include "smc.h" +#include "smc_core.h" +#include "smc_ism.h" + +struct smcd_dev_list smcd_dev_list = { + .list = LIST_HEAD_INIT(smcd_dev_list.list), + .lock = __SPIN_LOCK_UNLOCKED(smcd_dev_list.lock) +}; + +/* Test if an ISM communication is possible. */ +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +{ + return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, + vlan_id); +} + +int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, + void *data, size_t len) +{ + int rc; + + rc = smcd->ops->move_data(smcd, pos->token, pos->index, pos->signal, + pos->offset, data, len); + + return rc < 0 ? rc : 0; +} + +/* Set a connection using this DMBE. */ +void smc_ism_set_conn(struct smc_connection *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Unset a connection using this DMBE. */ +void smc_ism_unset_conn(struct smc_connection *conn) +{ + unsigned long flags; + + if (!conn->rmb_desc) + return; + + spin_lock_irqsave(&conn->lgr->smcd->lock, flags); + conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; + spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); +} + +/* Register a VLAN identifier with the ISM device. Use a reference count + * and add a VLAN identifier only when the first DMB using this VLAN is + * registered. + */ +int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *new_vlan, *vlan; + unsigned long flags; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + /* create new vlan entry, in case we need it */ + new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); + if (!new_vlan) + return -ENOMEM; + new_vlan->vlanid = vlanid; + refcount_set(&new_vlan->refcnt, 1); + + /* if there is an existing entry, increase count and return */ + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + refcount_inc(&vlan->refcnt); + kfree(new_vlan); + goto out; + } + } + + /* no existing entry found. + * add new entry to device; might fail, e.g., if HW limit reached + */ + if (smcd->ops->add_vlan_id(smcd, vlanid)) { + kfree(new_vlan); + rc = -EIO; + goto out; + } + list_add_tail(&new_vlan->list, &smcd->vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +/* Unregister a VLAN identifier with the ISM device. Use a reference count + * and remove a VLAN identifier only when the last DMB using this VLAN is + * unregistered. + */ +int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) +{ + struct smc_ism_vlanid *vlan; + unsigned long flags; + bool found = false; + int rc = 0; + + if (!vlanid) /* No valid vlan id */ + return -EINVAL; + + spin_lock_irqsave(&smcd->lock, flags); + list_for_each_entry(vlan, &smcd->vlan, list) { + if (vlan->vlanid == vlanid) { + if (!refcount_dec_and_test(&vlan->refcnt)) + goto out; + found = true; + break; + } + } + if (!found) { + rc = -ENOENT; + goto out; /* VLAN id not in table */ + } + + /* Found and the last reference just gone */ + if (smcd->ops->del_vlan_id(smcd, vlanid)) + rc = -EIO; + list_del(&vlan->list); + kfree(vlan); +out: + spin_unlock_irqrestore(&smcd->lock, flags); + return rc; +} + +int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = dmb_desc->token; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.cpu_addr = dmb_desc->cpu_addr; + dmb.dma_addr = dmb_desc->dma_addr; + dmb.dmb_len = dmb_desc->len; + return smcd->ops->unregister_dmb(smcd, &dmb); +} + +int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_len = dmb_len; + dmb.sba_idx = dmb_desc->sba_idx; + dmb.vlan_id = lgr->vlan_id; + dmb.rgid = lgr->peer_gid; + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +struct smc_ism_event_work { + struct work_struct work; + struct smcd_dev *smcd; + struct smcd_event event; +}; + +/* worker for SMC-D events */ +static void smc_ism_event_work(struct work_struct *work) +{ + struct smc_ism_event_work *wrk = + container_of(work, struct smc_ism_event_work, work); + + switch (wrk->event.type) { + case ISM_EVENT_GID: /* GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, wrk->event.tok); + break; + case ISM_EVENT_DMB: + break; + } + kfree(wrk); +} + +static void smcd_release(struct device *dev) +{ + struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); + + kfree(smcd->conn); + kfree(smcd); +} + +struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, + const struct smcd_ops *ops, int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) { + kfree(smcd); + return NULL; + } + + smcd->dev.parent = parent; + smcd->dev.release = smcd_release; + device_initialize(&smcd->dev); + dev_set_name(&smcd->dev, name); + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + INIT_LIST_HEAD(&smcd->vlan); + smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", + WQ_MEM_RECLAIM, name); + return smcd; +} +EXPORT_SYMBOL_GPL(smcd_alloc_dev); + +int smcd_register_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_add_tail(&smcd->list, &smcd_dev_list.list); + spin_unlock(&smcd_dev_list.lock); + + return device_add(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_register_dev); + +void smcd_unregister_dev(struct smcd_dev *smcd) +{ + spin_lock(&smcd_dev_list.lock); + list_del(&smcd->list); + spin_unlock(&smcd_dev_list.lock); + flush_workqueue(smcd->event_wq); + destroy_workqueue(smcd->event_wq); + smc_smcd_terminate(smcd, 0); + + device_del(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_unregister_dev); + +void smcd_free_dev(struct smcd_dev *smcd) +{ + put_device(&smcd->dev); +} +EXPORT_SYMBOL_GPL(smcd_free_dev); + +/* SMCD Device event handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer, + * - event->type (0 --> DMB, 1 --> GID), + * - event->code (event code), + * - event->tok (either DMB token when event type 0, or GID when event type 1) + * - event->time (time of day) + * - event->info (debug info). + * + * Context: + * - Function called in IRQ context from ISM device driver event handler. + */ +void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +{ + struct smc_ism_event_work *wrk; + + /* copy event to event work queue, and let it be handled there */ + wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + if (!wrk) + return; + INIT_WORK(&wrk->work, smc_ism_event_work); + wrk->smcd = smcd; + wrk->event = *event; + queue_work(smcd->event_wq, &wrk->work); +} +EXPORT_SYMBOL_GPL(smcd_handle_event); + +/* SMCD Device interrupt handler. Called from ISM device interrupt handler. + * Parameters are smcd device pointer and DMB number. Find the connection and + * schedule the tasklet for this connection. + * + * Context: + * - Function called in IRQ context from ISM device driver IRQ handler. + */ +void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno) +{ +} +EXPORT_SYMBOL_GPL(smcd_handle_irq); diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h new file mode 100644 index 000000000000..aee45b860b79 --- /dev/null +++ b/net/smc/smc_ism.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Shared Memory Communications Direct over ISM devices (SMC-D) + * + * SMC-D ISM device structure definitions. + * + * Copyright IBM Corp. 2018 + */ + +#ifndef SMCD_ISM_H +#define SMCD_ISM_H + +#include + +#include "smc.h" + +struct smcd_dev_list { /* List of SMCD devices */ + struct list_head list; + spinlock_t lock; /* Protects list of devices */ +}; + +extern struct smcd_dev_list smcd_dev_list; /* list of smcd devices */ + +struct smc_ism_vlanid { /* VLAN id set on ISM device */ + struct list_head list; + unsigned short vlanid; /* Vlan id */ + refcount_t refcnt; /* Reference count */ +}; + +struct smc_ism_position { /* ISM device position to write to */ + u64 token; /* Token of DMB */ + u32 offset; /* Offset into DMBE */ + u8 index; /* Index of DMBE */ + u8 signal; /* Generate interrupt on owner side */ +}; + +struct smcd_dev; + +int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +void smc_ism_set_conn(struct smc_connection *conn); +void smc_ism_unset_conn(struct smc_connection *conn); +int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); +int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, + struct smc_buf_desc *dmb_desc); +int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, + void *data, size_t len); +#endif -- cgit v1.2.3 From 1619f770589a183af56f248de261534b255122de Mon Sep 17 00:00:00 2001 From: Hans Wippel Date: Thu, 28 Jun 2018 19:05:08 +0200 Subject: net/smc: add pnetid support for SMC-D and ISM SMC-D relies on PNETIDs to find usable SMC-D/ISM devices for a SMC connection. This patch adds SMC-D/ISM support to the current PNETID implementation. Signed-off-by: Hans Wippel Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- include/net/smc.h | 1 + net/smc/smc_ism.c | 2 ++ net/smc/smc_pnet.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/smc/smc_pnet.h | 2 ++ 4 files changed, 46 insertions(+) (limited to 'include/net') diff --git a/include/net/smc.h b/include/net/smc.h index 824a7af8d654..9ef49f8b1002 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -73,6 +73,7 @@ struct smcd_dev { struct smc_connection **conn; struct list_head vlan; struct workqueue_struct *event_wq; + u8 pnetid[SMC_MAX_PNETID_LEN]; }; struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ca1ce42fd49f..f44e4dff244a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -13,6 +13,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_pnet.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -227,6 +228,7 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; + smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); spin_lock_init(&smcd->lock); INIT_LIST_HEAD(&smcd->vlan); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index cdc6e23b6ce1..1b6c066d3495 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -22,6 +22,7 @@ #include "smc_pnet.h" #include "smc_ib.h" +#include "smc_ism.h" static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { @@ -564,6 +565,27 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, spin_unlock(&smc_ib_devices.lock); } +static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, + struct smcd_dev **smcismdev) +{ + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct smcd_dev *ismdev; + + ndev = pnet_find_base_ndev(ndev); + if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + ndev_pnetid)) + return; /* pnetid could not be determined */ + + spin_lock(&smcd_dev_list.lock); + list_for_each_entry(ismdev, &smcd_dev_list.list, list) { + if (!memcmp(ismdev->pnetid, ndev_pnetid, SMC_MAX_PNETID_LEN)) { + *smcismdev = ismdev; + break; + } + } + spin_unlock(&smcd_dev_list.lock); +} + /* Lookup of coupled ib_device via SMC pnet table */ static void smc_pnet_find_roce_by_table(struct net_device *netdev, struct smc_ib_device **smcibdev, @@ -615,3 +637,22 @@ out_rel: out: return; } + +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev) +{ + struct dst_entry *dst = sk_dst_get(sk); + + *smcismdev = NULL; + if (!dst) + goto out; + if (!dst->dev) + goto out_rel; + + /* if possible, lookup via hardware-defined pnetid */ + smc_pnet_find_ism_by_pnetid(dst->dev, smcismdev); + +out_rel: + dst_release(dst); +out: + return; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index ad4455cde9e7..1e94fd4df7bc 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -17,6 +17,7 @@ #endif struct smc_ib_device; +struct smcd_dev; static inline int smc_pnetid_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid) @@ -33,5 +34,6 @@ void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_ib_device **smcibdev, u8 *ibport); +void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev); #endif -- cgit v1.2.3 From 755c31cd85aea35cf7a5e7253851b52c08eff6e9 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:51 -0700 Subject: net: sock: Change tx_queue_mapping in sock_common to unsigned short Change 'skc_tx_queue_mapping' field in sock_common structure from 'int' to 'unsigned short' type with ~0 indicating unset and other positive queue values being set. This will accommodate adding a new 'unsigned short' field in sock_common in the next patch for rx_queue_mapping. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- include/net/sock.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index b3b75419eafe..37b09c84504b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -214,7 +214,7 @@ struct sock_common { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; - int skc_tx_queue_mapping; + unsigned short skc_tx_queue_mapping; union { int skc_incoming_cpu; u32 skc_rcv_wnd; @@ -1681,17 +1681,25 @@ static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { + /* sk_tx_queue_mapping accept only upto a 16-bit value */ + if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) + return; sk->sk_tx_queue_mapping = tx_queue; } +#define NO_QUEUE_MAPPING USHRT_MAX + static inline void sk_tx_queue_clear(struct sock *sk) { - sk->sk_tx_queue_mapping = -1; + sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING; } static inline int sk_tx_queue_get(const struct sock *sk) { - return sk ? sk->sk_tx_queue_mapping : -1; + if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING) + return sk->sk_tx_queue_mapping; + + return -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) -- cgit v1.2.3 From c6345ce7d361dce1b5d02a2181ccb598c27fd7ae Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:26:57 -0700 Subject: net: Record receive queue number for a connection This patch adds a new field to sock_common 'skc_rx_queue_mapping' which holds the receive queue number for the connection. The Rx queue is marked in tcp_finish_connect() to allow a client app to do SO_INCOMING_NAPI_ID after a connect() call to get the right queue association for a socket. Rx queue is also marked in tcp_conn_request() to allow syn-ack to go on the right tx-queue associated with the queue on which syn is received. Signed-off-by: Amritha Nambiar Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/busy_poll.h | 1 + include/net/sock.h | 28 ++++++++++++++++++++++++++++ net/core/sock.c | 2 ++ net/ipv4/tcp_input.c | 3 +++ 4 files changed, 34 insertions(+) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c5187438af38..9e36fda652b7 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = skb->napi_id; #endif + sk_rx_queue_set(sk, skb); } /* variant used for unconnected sockets */ diff --git a/include/net/sock.h b/include/net/sock.h index 37b09c84504b..2b097cc89727 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair; * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings @@ -215,6 +216,9 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; +#ifdef CONFIG_XPS + unsigned short skc_rx_queue_mapping; +#endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; @@ -326,6 +330,9 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping +#ifdef CONFIG_XPS +#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping +#endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end @@ -1702,6 +1709,27 @@ static inline int sk_tx_queue_get(const struct sock *sk) return -1; } +static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + if (skb_rx_queue_recorded(skb)) { + u16 rx_queue = skb_get_rx_queue(skb); + + if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) + return; + + sk->sk_rx_queue_mapping = rx_queue; + } +#endif +} + +static inline void sk_rx_queue_clear(struct sock *sk) +{ +#ifdef CONFIG_XPS + sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; +#endif +} + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..dac6d785186b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2818,6 +2818,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_pacing_rate = ~0U; sk->sk_pacing_shift = 10; sk->sk_incoming_cpu = -1; + + sk_rx_queue_clear(sk); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eecd359595fc..a4731995e899 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -78,6 +78,7 @@ #include #include #include +#include int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -5592,6 +5593,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (skb) { icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); + sk_mark_napi_id(sk, skb); } tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); @@ -6420,6 +6422,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_openreq_init_rwin(req, sk, dst); + sk_rx_queue_set(req_to_sk(req), skb); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); -- cgit v1.2.3 From fc9bab24e9c654f62f3d411fc0b041be9e487e9d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 29 Jun 2018 21:27:02 -0700 Subject: net: Enable Tx queue selection based on Rx queues This patch adds support to pick Tx queue based on the Rx queue(s) map configuration set by the admin through the sysfs attribute for each Tx queue. If the user configuration for receive queue(s) map does not apply, then the Tx queue selection falls back to CPU(s) map based selection and finally to hashing. Signed-off-by: Amritha Nambiar Signed-off-by: David S. Miller --- include/net/sock.h | 10 +++++++++ net/core/dev.c | 62 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 55 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 2b097cc89727..2ed99bfa4595 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1730,6 +1730,16 @@ static inline void sk_rx_queue_clear(struct sock *sk) #endif } +#ifdef CONFIG_XPS +static inline int sk_rx_queue_get(const struct sock *sk) +{ + if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) + return sk->sk_rx_queue_mapping; + + return -1; +} +#endif + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/net/core/dev.c b/net/core/dev.c index 43b5575e40c5..08d58e0debe5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3459,35 +3459,63 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) } #endif /* CONFIG_NET_EGRESS */ -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +#ifdef CONFIG_XPS +static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, + struct xps_dev_maps *dev_maps, unsigned int tci) +{ + struct xps_map *map; + int queue_index = -1; + + if (dev->num_tc) { + tci *= dev->num_tc; + tci += netdev_get_prio_tc_map(dev, skb->priority); + } + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale( + skb_get_hash(skb), map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + return queue_index; +} +#endif + +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; - struct xps_map *map; + struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); + if (!static_key_false(&xps_rxqs_needed)) + goto get_cpus_map; + + dev_maps = rcu_dereference(dev->xps_rxqs_map); if (dev_maps) { - unsigned int tci = skb->sender_cpu - 1; + int tci = sk_rx_queue_get(sk); - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tci >= 0 && tci < dev->num_rx_queues) + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); + } - map = rcu_dereference(dev_maps->attr_map[tci]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; +get_cpus_map: + if (queue_index < 0) { + dev_maps = rcu_dereference(dev->xps_cpus_map); + if (dev_maps) { + unsigned int tci = skb->sender_cpu - 1; + + queue_index = __get_xps_queue_idx(dev, skb, dev_maps, + tci); } } rcu_read_unlock(); -- cgit v1.2.3 From 69b9e1e07d98b57b972df3c44647ca8795284d39 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:11 +0800 Subject: ipv4: add __ip_queue_xmit() that supports tos param This patch introduces __ip_queue_xmit(), through which the callers can pass tos param into it without having to set inet->tos. For ipv6, ip6_xmit() already allows passing tclass parameter. It's needed when some transport protocol doesn't use inet->tos, like sctp's per transport dscp, which will be added in next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/ip.h | 9 ++++++++- net/ipv4/ip_output.c | 9 +++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 0d2281b4b27a..09da79d8ceea 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -148,7 +148,8 @@ void ip_send_check(struct iphdr *ip); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos); void ip_init(void); int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, @@ -174,6 +175,12 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags); +static inline int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, + struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} + static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4) { return __ip_make_skb(sk, fl4, &sk->sk_write_queue, &inet_sk(sk)->cork.base); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b3308e9d9762..188cc586e7ff 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) } /* Note: skb->sk can be different from sk, in case of tunnels */ -int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), + RT_CONN_FLAGS_TOS(sk, tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -478,7 +479,7 @@ packet_routed: skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); - *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else @@ -511,7 +512,7 @@ no_route: kfree_skb(skb); return -EHOSTUNREACH; } -EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(__ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { -- cgit v1.2.3 From 8a9c58d28d0f66569737a3295116710ed24573cd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 2 Jul 2018 18:21:12 +0800 Subject: sctp: add support for dscp and flowlabel per transport Like some other per transport params, flowlabel and dscp are added in transport, asoc and sctp_sock. By default, transport sets its value from asoc's, and asoc does it from sctp_sock. flowlabel only works for ipv6 transport. Other than that they need to be passed down in sctp_xmit, flow4/6 also needs to set them before looking up route in get_dst. Note that it uses '& 0x100000' to check if flowlabel is set and '& 0x1' (tos 1st bit is unused) to check if dscp is set by users, so that they could be set to 0 by sockopt in next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 7 +++++++ include/net/sctp/structs.h | 9 +++++++++ net/sctp/associola.c | 7 +++++++ net/sctp/ipv6.c | 11 +++++++++-- net/sctp/protocol.c | 16 ++++++++++++---- 5 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b36c76635f18..83d94341e003 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -801,4 +801,11 @@ struct sctp_strreset_resptsn { __be32 receivers_next_tsn; }; +enum { + SCTP_DSCP_SET_MASK = 0x1, + SCTP_DSCP_VAL_MASK = 0xfc, + SCTP_FLOWLABEL_SET_MASK = 0x100000, + SCTP_FLOWLABEL_VAL_MASK = 0xfffff +}; + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 701a51736fa5..ab869e0d8326 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -193,6 +193,9 @@ struct sctp_sock { /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* The initial Path MTU to use for new associations. */ __u32 pathmtu; @@ -895,6 +898,9 @@ struct sctp_transport { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option @@ -1772,6 +1778,9 @@ struct sctp_association { */ __u16 pathmaxrxt; + __u32 flowlabel; + __u8 dscp; + /* Flag that path mtu update is pending */ __u8 pmtu_pending; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 5d5a16204d50..16ecfbc95614 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init( /* Initialize path max retrans value. */ asoc->pathmaxrxt = sp->pathmaxrxt; + asoc->flowlabel = sp->flowlabel; + asoc->dscp = sp->dscp; + /* Initialize default path MTU. */ asoc->pathmtu = sp->pathmtu; @@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->sackdelay = asoc->sackdelay; peer->sackfreq = asoc->sackfreq; + if (addr->sa.sa_family == AF_INET6) + peer->flowlabel = asoc->flowlabel; + peer->dscp = asoc->dscp; + /* Enable/disable heartbeat, SACK delay, and path MTU discovery * based on association setting. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0cd2e764f47f..38102bf7f13e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; + __u8 tclass = np->tclass; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); - IP6_ECN_flow_xmit(sk, fl6->flowlabel); + if (transport->dscp & SCTP_DSCP_SET_MASK) + tclass = transport->dscp & SCTP_DSCP_VAL_MASK; + + if (INET_ECN_is_capable(tclass)) + IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; @@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), - np->tclass); + tclass); rcu_read_unlock(); return res; } @@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; + if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) + fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 67f73d3a1356..e948db29ab53 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; + __u8 tos = inet_sk(sk)->tos; + if (t->dscp & SCTP_DSCP_SET_MASK) + tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(fl4, 0x0, sizeof(struct flowi4)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } @@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), + RT_CONN_FLAGS_TOS(asoc->base.sk, tos), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); @@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); + __u8 dscp = inet->tos; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, - skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); + skb->len, &transport->fl.u.ip4.saddr, + &transport->fl.u.ip4.daddr); + + if (transport->dscp & SCTP_DSCP_SET_MASK) + dscp = transport->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); - return ip_queue_xmit(&inet->sk, skb, &transport->fl); + return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp); } static struct sctp_af sctp_af_inet; -- cgit v1.2.3 From 17266ee939849cb095ed7dd9edbec4162172226b Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 2 Jul 2018 16:14:12 +0100 Subject: net: ipv4: listified version of ip_rcv Also involved adding a way to run a netfilter hook over a list of packets. Rather than attempting to make netfilter know about lists (which would be a major project in itself) we just let it call the regular okfn (in this case ip_rcv_finish()) for any packets it steals, and have it give us back a list of packets it's synchronously accepted (which normally NF_HOOK would automatically call okfn() on, but we want to be able to potentially pass the list to a listified version of okfn().) The netfilter hooks themselves are indirect calls that still happen per- packet (see nf_hook_entry_hookfn()), but again, changing that can be left for future work. There is potential for out-of-order receives if the netfilter hook ends up synchronously stealing packets, as they will be processed before any accepts earlier in the list. However, it was already possible for an asynchronous accept to cause out-of-order receives, so presumably this is considered OK. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 22 +++++++++++++++ include/net/ip.h | 2 ++ net/core/dev.c | 8 +++--- net/ipv4/af_inet.c | 1 + net/ipv4/ip_input.c | 68 ++++++++++++++++++++++++++++++++++++++++++----- 6 files changed, 94 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f67258f057ca..c1ef749b6f9f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2297,6 +2297,9 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); + void (*list_func) (struct list_head *, + struct packet_type *, + struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); void *af_packet_priv; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index dd2052f0efb7..5a5e0a2ab2a3 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct return ret; } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); + if (ret != 1) + list_del(&skb->list); + } +} + /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, unsigned int len); @@ -369,6 +383,14 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, return okfn(net, sk, skb); } +static inline void +NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, + struct list_head *head, struct net_device *in, struct net_device *out, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) +{ + /* nothing to do */ +} + static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, diff --git a/include/net/ip.h b/include/net/ip.h index 09da79d8ceea..99d1b835d2aa 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, struct ip_options_rcu *opt); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip_local_deliver(struct sk_buff *skb); int ip_mr_input(struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1bc485bb0678..5e22719ce71d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4806,9 +4806,11 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head, return; if (list_empty(head)) return; - - list_for_each_entry_safe(skb, next, head, list) - pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (pt_prev->list_func != NULL) + pt_prev->list_func(head, pt_prev, orig_dev); + else + list_for_each_entry_safe(skb, next, head, list) + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9263a2c114e0..c716be13d58c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1882,6 +1882,7 @@ fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, + .list_func = ip_list_rcv, }; static int __init inet_init(void) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7582713dd18f..914240830bdf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -408,10 +408,9 @@ drop_error: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; - struct net *net; u32 len; /* When the interface is in promisc. mode, drop all the crap @@ -421,7 +420,6 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; - net = dev_net(dev); __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); @@ -489,9 +487,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); @@ -500,5 +496,63 @@ inhdr_error: drop: kfree_skb(skb); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + struct sk_buff *skb, *next; + + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + list_for_each_entry_safe(skb, next, head, list) + ip_rcv_finish(net, NULL, skb); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + list_cut_before(&sublist, head, &skb->list); + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, dev, net); + /* start new sublist */ + curr_dev = dev; + curr_net = net; + } + } + /* dispatch final sublist */ + ip_sublist_rcv(head, curr_dev, curr_net); } -- cgit v1.2.3 From 80b14dee2bea128928537d61c333f24cb8cbb62f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 3 Jul 2018 15:42:48 -0700 Subject: net: Add a new socket option for a future transmit time. This patch introduces SO_TXTIME. User space enables this option in order to pass a desired future transmit time in a CMSG when calling sendmsg(2). The argument to this socket option is a 8-bytes long struct provided by the uapi header net_tstamp.h defined as: struct sock_txtime { clockid_t clockid; u32 flags; }; Note that new fields were added to struct sock by filling a 2-bytes hole found in the struct. For that reason, neither the struct size or number of cachelines were altered. Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 3 +++ arch/ia64/include/uapi/asm/socket.h | 3 +++ arch/mips/include/uapi/asm/socket.h | 3 +++ arch/parisc/include/uapi/asm/socket.h | 3 +++ arch/s390/include/uapi/asm/socket.h | 3 +++ arch/sparc/include/uapi/asm/socket.h | 3 +++ arch/xtensa/include/uapi/asm/socket.h | 3 +++ include/net/sock.h | 10 ++++++++++ include/uapi/asm-generic/socket.h | 3 +++ include/uapi/linux/net_tstamp.h | 15 +++++++++++++++ net/core/sock.c | 35 +++++++++++++++++++++++++++++++++++ 11 files changed, 84 insertions(+) (limited to 'include/net') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index be14f16149d5..065fb372e355 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -112,4 +112,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 3efba40adc54..c872c4e6bafb 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -114,4 +114,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 49c3d4795963..71370fb3ceef 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -123,4 +123,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1d0fdc3b5d22..061b9cf2a779 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -104,4 +104,7 @@ #define SO_ZEROCOPY 0x4035 +#define SO_TXTIME 0x4036 +#define SCM_TXTIME SO_TXTIME + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 3510c0fd06f4..39d901476ee5 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -111,4 +111,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index d58520c2e6ff..7ea35e5601b6 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -101,6 +101,9 @@ #define SO_ZEROCOPY 0x003e +#define SO_TXTIME 0x003f +#define SCM_TXTIME SO_TXTIME + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 75a07b8119a9..1de07a7f7680 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -116,4 +116,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 2ed99bfa4595..68347b9821c6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -319,6 +319,9 @@ struct sock_common { * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_rcu: used during RCU grace period + * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) + * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_unused: unused txtime flags */ struct sock { /* @@ -475,6 +478,11 @@ struct sock { u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; + + u8 sk_clockid; + u8 sk_txtime_deadline_mode : 1, + sk_txtime_unused : 7; + struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -790,6 +798,7 @@ enum sock_flags { SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ + SOCK_TXTIME, }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1585,6 +1594,7 @@ void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { + u64 transmit_time; u32 mark; u16 tsflags; }; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 0ae758c90e54..a12692e5f7a8 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -107,4 +107,7 @@ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 4fe104b2411f..c9a77c353b98 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -141,4 +141,19 @@ struct scm_ts_pktinfo { __u32 reserved[2]; }; +/* + * SO_TXTIME gets a struct sock_txtime with flags being an integer bit + * field comprised of these values. + */ +enum txtime_flags { + SOF_TXTIME_DEADLINE_MODE = (1 << 0), + + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) +}; + +struct sock_txtime { + clockid_t clockid; /* reference clockid */ + u32 flags; /* flags defined by enum txtime_flags */ +}; + #endif /* _NET_TIMESTAMPING_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 6429982eb976..fe64b839f1b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -91,6 +91,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -697,6 +698,7 @@ EXPORT_SYMBOL(sk_mc_loop); int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { + struct sock_txtime sk_txtime; struct sock *sk = sock->sk; int val; int valbool; @@ -1070,6 +1072,24 @@ set_rcvbuf: } break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + } else if (optlen != sizeof(struct sock_txtime)) { + ret = -EINVAL; + } else if (copy_from_user(&sk_txtime, optval, + sizeof(struct sock_txtime))) { + ret = -EFAULT; + } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { + ret = -EINVAL; + } else { + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + } + break; + default: ret = -ENOPROTOOPT; break; @@ -1115,6 +1135,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, u64 val64; struct linger ling; struct timeval tm; + struct sock_txtime txtime; } v; int lv = sizeof(int); @@ -1403,6 +1424,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + lv = sizeof(v.txtime); + v.txtime.clockid = sk->sk_clockid; + v.txtime.flags |= sk->sk_txtime_deadline_mode ? + SOF_TXTIME_DEADLINE_MODE : 0; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -2137,6 +2165,13 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: -- cgit v1.2.3 From bc969a977880511057053642a81371196303ca01 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:49 -0700 Subject: net: ipv4: Hook into time based transmission Add a transmit_time field to struct inet_cork, then copy the timestamp from the CMSG cookie at ip_setup_cork() so we can safely copy it into the skb later during __ip_make_skb(). For the raw fast path, just perform the copy at raw_send_hdrinc(). Signed-off-by: Richard Cochran Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + net/ipv4/icmp.c | 2 ++ net/ipv4/ip_output.c | 3 +++ net/ipv4/ping.c | 1 + net/ipv4/raw.c | 2 ++ net/ipv4/udp.c | 1 + 6 files changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 83d5b3c2ac42..314be484c696 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -148,6 +148,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u64 transmit_time; }; struct inet_cork_full { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..937239afd68d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -437,6 +437,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -715,6 +716,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 188cc586e7ff..570e3ebc3974 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1154,6 +1154,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->tos = ipc->tos; cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; + cork->transmit_time = ipc->sockc.transmit_time; return 0; } @@ -1414,6 +1415,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = cork->transmit_time; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1551,6 +1553,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 2ed64bca54e3..b47492205507 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -746,6 +746,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index abb3c9490c55..446af7be2b55 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -381,6 +381,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; skb_dst_set(skb, &rt->dst); *rtp = NULL; @@ -562,6 +563,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24e116ddae79..5c76ba0666ec 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -930,6 +930,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; + ipc.sockc.transmit_time = 0; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; -- cgit v1.2.3 From 860b642b9c33ea4a6ae2f416607b0b98a9d11bb0 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:52 -0700 Subject: net/sched: Allow creating a Qdisc watchdog with other clocks This adds 'qdisc_watchdog_init_clockid()' that allows a clockid to be passed, this allows other time references to be used when scheduling the Qdisc to run. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 ++ net/sched/sch_api.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 815b92a23936..2466ea143d01 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -72,6 +72,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) -- cgit v1.2.3 From 88cab77162e86e0f6a2b7e4f859c1435c4e24feb Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:54 -0700 Subject: net/sched: Add HW offloading capability to ETF Add infra so etf qdisc supports HW offload of time-based transmission. For hw offload, the time sorted list is still used, so packets are dequeued always in order of txtime. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf offload delta 100000 \ clockid CLOCK_REALTIME In this example, the Qdisc will use HW offload for the control of the transmission time through the network adapter. The hrtimer used for packets scheduling inside the qdisc will use the clockid CLOCK_REALTIME as reference and packets leave the Qdisc "delta" (100000) nanoseconds before their transmission time. Because this will be using HW offload and since dynamic clocks are not supported by the hrtimer, the system clock and the PHC clock must be synchronized for this mode to behave as expected. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 5 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 71 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2466ea143d01..7dc769e5452b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -155,4 +155,9 @@ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_etf_qopt_offload { + u8 enable; + s32 queue; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d5e933ce1447..949118461009 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -944,6 +944,7 @@ struct tc_etf_qopt { __s32 clockid; __u32 flags; #define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4b7f4903ac17..932a136db568 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -20,8 +20,10 @@ #include #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) struct etf_sched_data { + bool offload; bool deadline_mode; int clockid; int queue; @@ -45,6 +47,9 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt, * * Dynamic clockids are not supported. * * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); @@ -225,6 +230,56 @@ out: return skb; } +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -251,8 +306,9 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_ETF_PARMS]); - pr_debug("delta %d clockid %d deadline %s\n", + pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); @@ -261,9 +317,16 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); switch (q->clockid) { @@ -326,10 +389,13 @@ static void etf_reset(struct Qdisc *sch) static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -344,6 +410,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.delta = q->delta; opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; -- cgit v1.2.3 From 4b15c7075352668d4467ced7594b676707d11cae Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:43:00 -0700 Subject: net/sched: Make etf report drops on error_queue Use the socket error queue for reporting dropped packets if the socket has enabled that feature through the SO_TXTIME API. Packets are dropped either on enqueue() if they aren't accepted by the qdisc or on dequeue() if the system misses their deadline. Those are reported as different errors so applications can react accordingly. Userspace can retrieve the errors through the socket error queue and the corresponding cmsg interfaces. A struct sock_extended_err* is used for returning the error data, and the packet's timestamp can be retrieved by adding both ee_data and ee_info fields as e.g.: ((__u64) serr->ee_data << 32) + serr->ee_info This feature is disabled by default and must be explicitly enabled by applications. Enabling it can bring some overhead for the Tx cycles of the application. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- include/uapi/linux/errqueue.h | 4 ++++ include/uapi/linux/net_tstamp.h | 5 ++++- net/core/sock.c | 4 ++++ net/sched/sch_etf.c | 35 +++++++++++++++++++++++++++++++++-- 5 files changed, 47 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 68347b9821c6..e0eac9ef44b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,7 +481,8 @@ struct sock { u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, - sk_txtime_unused : 7; + sk_txtime_report_errors : 1, + sk_txtime_unused : 6; struct socket *sk_socket; void *sk_user_data; diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..c0151200f7d1 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,12 +20,16 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_TXTIME 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_CODE_TXTIME_INVALID_PARAM 1 +#define SO_EE_CODE_TXTIME_MISSED 2 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index c9a77c353b98..f8f4539f1135 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -147,8 +147,11 @@ struct scm_ts_pktinfo { */ enum txtime_flags { SOF_TXTIME_DEADLINE_MODE = (1 << 0), + SOF_TXTIME_REPORT_ERRORS = (1 << 1), - SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) + SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS, + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) | + SOF_TXTIME_FLAGS_LAST }; struct sock_txtime { diff --git a/net/core/sock.c b/net/core/sock.c index fe64b839f1b2..03fdea5b0f57 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1087,6 +1087,8 @@ set_rcvbuf: sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); } break; @@ -1429,6 +1431,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? + SOF_TXTIME_REPORT_ERRORS : 0; break; default: diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 932a136db568..1538d6fa8165 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,32 @@ static void reset_watchdog(struct Qdisc *sch) qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -130,8 +157,11 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct rb_node **p = &q->head.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; - if (!is_packet_valid(sch, nskb)) + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); + } while (*p) { struct sk_buff *skb; @@ -174,6 +204,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, if (drop) { struct sk_buff *to_free = NULL; + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + qdisc_drop(skb, sch, &to_free); kfree_skb_list(to_free); qdisc_qstats_overlimit(sch); @@ -199,7 +231,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) now = q->get_time(); /* Drop if packet has expired while in queue. */ - /* FIXME: Must return error on the socket's error queue */ if (ktime_before(skb->tstamp, now)) { timesortedlist_erase(sch, skb, true); skb = NULL; -- cgit v1.2.3 From eabaef1896bc06319461a644e3aa139885454def Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:28 +0300 Subject: devlink: Add devlink_param register and unregister Define configuration parameters data structure. Add functions to register and unregister the driver supported configuration parameters table. For each parameter registered, the driver should fill all the parameter's fields. In case the only supported configuration mode is "driverinit" the parameter's get()/set() functions are not required and should be set to NULL, for any other configuration mode, these functions are required and should be set by the driver. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 85 +++++++++++++++++++++++++ include/uapi/linux/devlink.h | 10 +++ net/core/devlink.c | 148 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index e336ea9c73df..4a0687a1fb99 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -27,6 +27,7 @@ struct devlink { struct list_head sb_list; struct list_head dpipe_table_list; struct list_head resource_list; + struct list_head param_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -295,6 +296,68 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 +#define DEVLINK_PARAM_MAX_STRING_VALUE 32 +enum devlink_param_type { + DEVLINK_PARAM_TYPE_U8, + DEVLINK_PARAM_TYPE_U16, + DEVLINK_PARAM_TYPE_U32, + DEVLINK_PARAM_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL, +}; + +union devlink_param_value { + u8 vu8; + u16 vu16; + u32 vu32; + const char *vstr; + bool vbool; +}; + +struct devlink_param_gset_ctx { + union devlink_param_value val; + enum devlink_param_cmode cmode; +}; + +/** + * struct devlink_param - devlink configuration parameter data + * @name: name of the parameter + * @generic: indicates if the parameter is generic or driver specific + * @type: parameter type + * @supported_cmodes: bitmap of supported configuration modes + * @get: get parameter value, used for runtime and permanent + * configuration modes + * @set: set parameter value, used for runtime and permanent + * configuration modes + * + * This struct should be used by the driver to fill the data for + * a parameter it registers. + */ +struct devlink_param { + u32 id; + const char *name; + bool generic; + enum devlink_param_type type; + unsigned long supported_cmodes; + int (*get)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); + int (*set)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx); +}; + +struct devlink_param_item { + struct list_head list; + const struct devlink_param *param; + union devlink_param_value driverinit_value; + bool driverinit_value_valid; +}; + +enum devlink_param_generic_id { + + /* add new param generic ids above here*/ + __DEVLINK_PARAM_GENERIC_ID_MAX, + DEVLINK_PARAM_GENERIC_ID_MAX = __DEVLINK_PARAM_GENERIC_ID_MAX - 1, +}; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -430,6 +493,12 @@ void devlink_resource_occ_get_register(struct devlink *devlink, void *occ_get_priv); void devlink_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count); #else @@ -622,6 +691,22 @@ devlink_resource_occ_get_unregister(struct devlink *devlink, { } +static inline int +devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + return 0; +} + +static inline void +devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 75cb5450c851..d814fa67c7b9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -142,6 +142,16 @@ enum devlink_port_flavour { */ }; +enum devlink_param_cmode { + DEVLINK_PARAM_CMODE_RUNTIME, + DEVLINK_PARAM_CMODE_DRIVERINIT, + DEVLINK_PARAM_CMODE_PERMANENT, + + /* Add new configuration modes above */ + __DEVLINK_PARAM_CMODE_MAX, + DEVLINK_PARAM_CMODE_MAX = __DEVLINK_PARAM_CMODE_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 22099705cc41..41b1a5d1c992 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2604,6 +2604,82 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static const struct devlink_param devlink_param_generic[] = {}; + +static int devlink_param_generic_verify(const struct devlink_param *param) +{ + /* verify it match generic parameter by id and name */ + if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + if (strcmp(param->name, devlink_param_generic[param->id].name)) + return -ENOENT; + + WARN_ON(param->type != devlink_param_generic[param->id].type); + + return 0; +} + +static int devlink_param_driver_verify(const struct devlink_param *param) +{ + int i; + + if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) + return -EINVAL; + /* verify no such name in generic params */ + for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) + if (!strcmp(param->name, devlink_param_generic[i].name)) + return -EEXIST; + + return 0; +} + +static struct devlink_param_item * +devlink_param_find_by_name(struct list_head *param_list, + const char *param_name) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (!strcmp(param_item->param->name, param_name)) + return param_item; + return NULL; +} + +static int devlink_param_register_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + if (devlink_param_find_by_name(&devlink->param_list, + param->name)) + return -EEXIST; + + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); + + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); + if (!param_item) + return -ENOMEM; + param_item->param = param; + + list_add_tail(¶m_item->list, &devlink->param_list); + return 0; +} + +static void devlink_param_unregister_one(struct devlink *devlink, + const struct devlink_param *param) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_name(&devlink->param_list, + param->name); + WARN_ON(!param_item); + list_del(¶m_item->list); + kfree(param_item); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2845,6 +2921,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); + INIT_LIST_HEAD(&devlink->param_list); mutex_init(&devlink->lock); return devlink; } @@ -3434,6 +3511,77 @@ out: } EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); +/** + * devlink_params_register - register configuration parameters + * + * @devlink: devlink + * @params: configuration parameters array + * @params_count: number of parameters provided + * + * Register the configuration parameters supported by the driver. + */ +int devlink_params_register(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + int err; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) { + if (!param || !param->name || !param->supported_cmodes) { + err = -EINVAL; + goto rollback; + } + if (param->generic) { + err = devlink_param_generic_verify(param); + if (err) + goto rollback; + } else { + err = devlink_param_driver_verify(param); + if (err) + goto rollback; + } + err = devlink_param_register_one(devlink, param); + if (err) + goto rollback; + } + + mutex_unlock(&devlink->lock); + return 0; + +rollback: + if (!i) + goto unlock; + for (param--; i > 0; i--, param--) + devlink_param_unregister_one(devlink, param); +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_params_register); + +/** + * devlink_params_unregister - unregister configuration parameters + * @devlink: devlink + * @params: configuration parameters to unregister + * @params_count: number of parameters provided + */ +void devlink_params_unregister(struct devlink *devlink, + const struct devlink_param *params, + size_t params_count) +{ + const struct devlink_param *param = params; + int i; + + mutex_lock(&devlink->lock); + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister_one(devlink, param); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_params_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From e3b7ca18ad7b2f47ebd3b6e6ce58a42c6ec24746 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:30 +0300 Subject: devlink: Add param set command Add param set command to set value for a parameter. Value can be set to any of the supported configuration modes. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 134 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4a0687a1fb99..88062752dcd7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -328,6 +328,7 @@ struct devlink_param_gset_ctx { * configuration modes * @set: set parameter value, used for runtime and permanent * configuration modes + * @validate: validate input value is applicable (within value range, etc.) * * This struct should be used by the driver to fill the data for * a parameter it registers. @@ -342,6 +343,9 @@ struct devlink_param { struct devlink_param_gset_ctx *ctx); int (*set)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx); + int (*validate)(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack); }; struct devlink_param_item { diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2ccfe84176bf..ea0623e568f0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -79,6 +79,7 @@ enum devlink_command { DEVLINK_CMD_RELOAD, DEVLINK_CMD_PARAM_GET, /* can dump */ + DEVLINK_CMD_PARAM_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index b22d41275f0b..0cd7a42dcec2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2661,6 +2661,15 @@ static int devlink_param_get(struct devlink *devlink, return param->get(devlink, param->id, ctx); } +static int devlink_param_set(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx) +{ + if (!param->set) + return -EOPNOTSUPP; + return param->set(devlink, param->id, ctx); +} + static int devlink_param_type_to_nla_type(enum devlink_param_type param_type) { @@ -2847,6 +2856,69 @@ out: return msg->len; } +static int +devlink_param_type_get_from_info(struct genl_info *info, + enum devlink_param_type *param_type) +{ + if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + return -EINVAL; + + switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { + case NLA_U8: + *param_type = DEVLINK_PARAM_TYPE_U8; + break; + case NLA_U16: + *param_type = DEVLINK_PARAM_TYPE_U16; + break; + case NLA_U32: + *param_type = DEVLINK_PARAM_TYPE_U32; + break; + case NLA_STRING: + *param_type = DEVLINK_PARAM_TYPE_STRING; + break; + case NLA_FLAG: + *param_type = DEVLINK_PARAM_TYPE_BOOL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +devlink_param_value_get_from_info(const struct devlink_param *param, + struct genl_info *info, + union devlink_param_value *value) +{ + if (param->type != DEVLINK_PARAM_TYPE_BOOL && + !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) + return -EINVAL; + + switch (param->type) { + case DEVLINK_PARAM_TYPE_U8: + value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U16: + value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_U32: + value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_STRING: + if (nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) > + DEVLINK_PARAM_MAX_STRING_VALUE) + return -EINVAL; + value->vstr = nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]); + break; + case DEVLINK_PARAM_TYPE_BOOL: + value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ? + true : false; + break; + } + return 0; +} + static struct devlink_param_item * devlink_param_get_from_info(struct devlink *devlink, struct genl_info *info) @@ -2887,6 +2959,58 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + enum devlink_param_type param_type; + struct devlink_param_gset_ctx ctx; + enum devlink_param_cmode cmode; + struct devlink_param_item *param_item; + const struct devlink_param *param; + union devlink_param_value value; + int err = 0; + + param_item = devlink_param_get_from_info(devlink, info); + if (!param_item) + return -EINVAL; + param = param_item->param; + err = devlink_param_type_get_from_info(info, ¶m_type); + if (err) + return err; + if (param_type != param->type) + return -EINVAL; + err = devlink_param_value_get_from_info(param, info, &value); + if (err) + return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, info->extack); + if (err) + return err; + } + + if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + return -EINVAL; + cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); + if (!devlink_param_cmode_is_supported(param, cmode)) + return -EOPNOTSUPP; + + if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + param_item->driverinit_value = value; + param_item->driverinit_value_valid = true; + } else { + if (!param->set) + return -EOPNOTSUPP; + ctx.val = value; + ctx.cmode = cmode; + err = devlink_param_set(devlink, param, &ctx); + if (err) + return err; + } + + return 0; +} + static int devlink_param_register_one(struct devlink *devlink, const struct devlink_param *param) { @@ -2942,6 +3066,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, + [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -3133,6 +3260,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_PARAM_SET, + .doit = devlink_nl_cmd_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From ec01aeb1803eaaf0d006e7b07b5ddb5e429c38a4 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:31 +0300 Subject: devlink: Add support for get/set driverinit value "driverinit" configuration mode value is held by devlink to enable the driver query the value after reload. Two additional functions added to help the driver get/set the value from/to devlink: devlink_param_driverinit_value_set() and devlink_param_driverinit_value_get(). Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 18 ++++++++++++ net/core/devlink.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 88062752dcd7..3302e43b09a4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -503,6 +503,10 @@ int devlink_params_register(struct devlink *devlink, void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count); +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val); +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val); #else @@ -711,6 +715,20 @@ devlink_params_unregister(struct devlink *devlink, } +static inline int +devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + return -EOPNOTSUPP; +} + +static inline int +devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 0cd7a42dcec2..3af08f4562b5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2645,6 +2645,17 @@ devlink_param_find_by_name(struct list_head *param_list, return NULL; } +static struct devlink_param_item * +devlink_param_find_by_id(struct list_head *param_list, u32 param_id) +{ + struct devlink_param_item *param_item; + + list_for_each_entry(param_item, param_list, list) + if (param_item->param->id == param_id) + return param_item; + return NULL; +} + static bool devlink_param_cmode_is_supported(const struct devlink_param *param, enum devlink_param_cmode cmode) @@ -3966,6 +3977,72 @@ void devlink_params_unregister(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devlink_params_unregister); +/** + * devlink_param_driverinit_value_get - get configuration parameter + * value for driver initializing + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter in driverinit configuration mode + * + * This function should be used by the driver to get driverinit + * configuration for initialization after reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, + union devlink_param_value *init_val) +{ + struct devlink_param_item *param_item; + + if (!devlink->ops || !devlink->ops->reload) + return -EOPNOTSUPP; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid || + !devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + *init_val = param_item->driverinit_value; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); + +/** + * devlink_param_driverinit_value_set - set value of configuration + * parameter for driverinit + * configuration mode + * + * @devlink: devlink + * @param_id: parameter ID + * @init_val: value of parameter to set for driverinit configuration mode + * + * This function should be used by the driver to set driverinit + * configuration mode default value. + */ +int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, + union devlink_param_value init_val) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From ea601e17098856ee059f35c2a75659e57df81f25 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:32 +0300 Subject: devlink: Add devlink notifications support for params Add devlink_param_notify() function to support devlink param notifications. Add notification call to devlink param set, register and unregister functions. Add devlink_param_value_changed() function to enable the driver notify devlink on value change. Driver should use this function after value was changed on any configuration mode part to driverinit. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 7 +++++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3302e43b09a4..792edaa996ba 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -507,6 +507,7 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *init_val); int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val); +void devlink_param_value_changed(struct devlink *devlink, u32 param_id); #else @@ -729,6 +730,12 @@ devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, return -EOPNOTSUPP; } +static inline void +devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ea0623e568f0..68641fb56654 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -80,6 +80,8 @@ enum devlink_command { DEVLINK_CMD_PARAM_GET, /* can dump */ DEVLINK_CMD_PARAM_SET, + DEVLINK_CMD_PARAM_NEW, + DEVLINK_CMD_PARAM_DEL, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 3af08f4562b5..89d948fd4727 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2828,6 +2828,28 @@ genlmsg_cancel: return -EMSGSIZE; } +static void devlink_param_notify(struct devlink *devlink, + struct devlink_param_item *param_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { @@ -3019,6 +3041,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return err; } + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } @@ -3042,6 +3065,7 @@ static int devlink_param_register_one(struct devlink *devlink, param_item->param = param; list_add_tail(¶m_item->list, &devlink->param_list); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } @@ -3053,6 +3077,7 @@ static void devlink_param_unregister_one(struct devlink *devlink, param_item = devlink_param_find_by_name(&devlink->param_list, param->name); WARN_ON(!param_item); + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); list_del(¶m_item->list); kfree(param_item); } @@ -4039,10 +4064,35 @@ int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); return 0; } EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); +/** + * devlink_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink: devlink + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. + * For driverinit configuration mode driver should use the function + * devlink_param_driverinit_value_set() instead. + */ +void devlink_param_value_changed(struct devlink *devlink, u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_value_changed); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From 036467c3990c75ec8ce97e517a864b52e184a1aa Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 4 Jul 2018 14:30:33 +0300 Subject: devlink: Add generic parameters internal_err_reset and max_macs Add 2 first generic parameters to devlink configuration parameters set: internal_err_reset - When set enables reset device on internal errors. max_macs - max number of MACs per ETH port. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 31 +++++++++++++++++++++++++++++++ net/core/devlink.c | 14 +++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 792edaa996ba..a1c230d18911 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -356,12 +356,43 @@ struct devlink_param_item { }; enum devlink_param_generic_id { + DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + DEVLINK_PARAM_GENERIC_ID_MAX_MACS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, DEVLINK_PARAM_GENERIC_ID_MAX = __DEVLINK_PARAM_GENERIC_ID_MAX - 1, }; +#define DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME "internal_error_reset" +#define DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_MAX_MACS_NAME "max_macs" +#define DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ +{ \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ + .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, \ + .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \ + .generic = true, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ +} + +#define DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes, _get, _set, _validate) \ +{ \ + .id = _id, \ + .name = _name, \ + .type = _type, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ +} + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 89d948fd4727..5bbd0aa7571a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2604,7 +2604,19 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } -static const struct devlink_param devlink_param_generic[] = {}; +static const struct devlink_param devlink_param_generic[] = { + { + .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, + .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, + .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, + }, + +}; static int devlink_param_generic_verify(const struct devlink_param *param) { -- cgit v1.2.3 From f567bcdae2b052bab94be7903863cb9ab47c907c Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Wed, 4 Jul 2018 14:30:36 +0300 Subject: devlink: Add enable_sriov boolean generic parameter enable_sriov - Enables Single-Root Input/Output Virtualization(SR-IOV) characteristic of the device. Reviewed-by: Michael Chan Signed-off-by: Vasundhara Volam Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index a1c230d18911..8ed571385626 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -358,6 +358,7 @@ struct devlink_param_item { enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, + DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -370,6 +371,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_MAX_MACS_NAME "max_macs" #define DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME "enable_sriov" +#define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 5bbd0aa7571a..470f3dbfecfe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2615,7 +2615,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, }, - + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From d8269e2cbf908f9d26aa5d3217236227dffd1d89 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 5 Jul 2018 15:49:42 +0100 Subject: net: ipv6: listify ipv6_rcv() and ip6_rcv_finish() Essentially the same as the ipv4 equivalents. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 + net/ipv6/af_inet6.c | 1 + net/ipv6/ip6_input.c | 131 ++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 118 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 16475c269749..b7843e0b16ee 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -922,6 +922,8 @@ static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9ed0eae91758..c9535354149f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .func = ipv6_rcv, + .list_func = ipv6_list_rcv, }; static int __init ipv6_packet_init(void) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index f08d34491ece..6242682be876 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,17 +47,11 @@ #include #include -int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static void ip6_rcv_finish_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { void (*edemux)(struct sk_buff *skb); - /* if ingress device is enslaved to an L3 master device pass the - * skb to its handler for processing - */ - skb = l3mdev_ip6_rcv(skb); - if (!skb) - return NET_RX_SUCCESS; - if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -67,20 +61,73 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) } if (!skb_valid_dst(skb)) ip6_route_input(skb); +} + +int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } -int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static void ip6_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) + dst_input(skb); +} + +static void ip6_list_rcv_finish(struct net *net, struct sock *sk, + struct list_head *head) +{ + struct dst_entry *curr_dst = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct dst_entry *dst; + + list_del(&skb->list); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + continue; + ip6_rcv_finish_core(net, sk, skb); + dst = skb_dst(skb); + if (curr_dst != dst) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv_finish(&sublist); +} + +static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, + struct net *net) { const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; - struct net *net = dev_net(skb->dev); if (skb->pkt_type == PACKET_OTHERHOST) { kfree_skb(skb); - return NET_RX_DROP; + return NULL; } rcu_read_lock(); @@ -196,7 +243,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); - return NET_RX_DROP; + return NULL; } } @@ -205,15 +252,67 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* Must drop socket now because of tproxy. */ skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, - net, NULL, skb, dev, NULL, - ip6_rcv_finish); + return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); - return NET_RX_DROP; + return NULL; +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + struct net *net = dev_net(skb->dev); + + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + return NET_RX_DROP; + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip6_rcv_finish); +} + +static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip6_rcv_finish); + ip6_list_rcv_finish(net, NULL, head); +} + +/* Receive a list of IPv6 packets */ +void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + struct list_head sublist; + + INIT_LIST_HEAD(&sublist); + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + list_del(&skb->list); + skb = ip6_rcv_core(skb, dev, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip6_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip6_sublist_rcv(&sublist, curr_dev, curr_net); } /* -- cgit v1.2.3 From cfdb0c2d095ac5d7f09cac1317b7d0a9e8178134 Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:12:50 +0530 Subject: Bluetooth: Store Resolv list size When the controller supports the Read LE Resolv List size feature, the maximum list size are read and now stored. Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 17.979791 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 17.980629 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 17.980786 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 17.981627 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #59 [hci0] 17.981786 > HCI Event: Command Complete (0x0e) plen 12 #60 [hci0] 17.982636 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 8 ++++++++ net/bluetooth/hci_debugfs.c | 19 +++++++++++++++++++ net/bluetooth/hci_event.c | 18 ++++++++++++++++++ 5 files changed, 53 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 1668211297a9..484f24c7a415 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,12 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a +struct hci_rp_le_read_resolv_list_size { + __u8 status; + __u8 size; +} __packed; + #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 893bbbb5d2fa..409f49bd8338 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -221,6 +221,7 @@ struct hci_dev { __u8 features[HCI_MAX_PAGES][8]; __u8 le_features[8]; __u8 le_white_list_size; + __u8 le_resolv_list_size; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -367,6 +368,7 @@ struct hci_dev { struct list_head identity_resolving_keys; struct list_head remote_oob_data; struct list_head le_white_list; + struct list_head le_resolv_list; struct list_head le_conn_params; struct list_head pend_le_conns; struct list_head pend_le_reports; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ee8ef1228263..036e14267d0a 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -714,6 +714,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); } + if (hdev->commands[34] & 0x40) { + /* Read LE Resolving List Size */ + hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE, + 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); @@ -3017,6 +3023,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_white_list); + INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); @@ -3218,6 +3225,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 0d8ab5b3c177..51f5b1efc3a5 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -694,6 +694,21 @@ static int white_list_show(struct seq_file *f, void *ptr) DEFINE_SHOW_ATTRIBUTE(white_list); +static int resolv_list_show(struct seq_file *f, void *ptr) +{ + struct hci_dev *hdev = f->private; + struct bdaddr_list *b; + + hci_dev_lock(hdev); + list_for_each_entry(b, &hdev->le_resolv_list, list) + seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type); + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(resolv_list); + static int identity_resolving_keys_show(struct seq_file *f, void *ptr) { struct hci_dev *hdev = f->private; @@ -955,6 +970,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &hdev->le_white_list_size); debugfs_create_file("white_list", 0444, hdev->debugfs, hdev, &white_list_fops); + debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs, + &hdev->le_resolv_list_size); + debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev, + &resolv_list_fops); debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs, hdev, &identity_resolving_keys_fops); debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 235b5aaab23d..6ee69a79258f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -221,6 +221,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb) hdev->ssp_debug_mode = 0; hci_bdaddr_list_clear(&hdev->le_white_list); + hci_bdaddr_list_clear(&hdev->le_resolv_list); } static void hci_cc_read_stored_link_key(struct hci_dev *hdev, @@ -1306,6 +1307,19 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size); + + if (rp->status) + return; + + hdev->le_resolv_list_size = rp->size; +} + static void hci_cc_le_read_max_data_len(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3015,6 +3029,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_READ_RESOLV_LIST_SIZE: + hci_cc_le_read_resolv_list_size(hdev, skb); + break; + case HCI_OP_LE_READ_MAX_DATA_LEN: hci_cc_le_read_max_data_len(hdev, skb); break; -- cgit v1.2.3 From 545f2596b907f0747170c7cb71edc74cecf68c5c Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Fri, 29 Jun 2018 12:13:20 +0530 Subject: Bluetooth: Add HCI command for clear Resolv list Check for Resolv list supported by controller. So check the supported commmand first before issuing this command i.e.,HCI_OP_LE_CLEAR_RESOLV_LIST Before patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 13.338168 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 13.338842 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 13.339029 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 13.339939 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 13.340152 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 13.340952 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #61 [hci0] 13.341180 > HCI Event: Command Complete (0x0e) plen 12 #62 [hci0] 13.341898 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 After patch: < HCI Command: LE Read White List... (0x08|0x000f) plen 0 #55 [hci0] 28.919131 > HCI Event: Command Complete (0x0e) plen 5 #56 [hci0] 28.920016 LE Read White List Size (0x08|0x000f) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear White List (0x08|0x0010) plen 0 #57 [hci0] 28.920164 > HCI Event: Command Complete (0x0e) plen 4 #58 [hci0] 28.920873 LE Clear White List (0x08|0x0010) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Resolving L.. (0x08|0x002a) plen 0 #59 [hci0] 28.921109 > HCI Event: Command Complete (0x0e) plen 5 #60 [hci0] 28.922016 LE Read Resolving List Size (0x08|0x002a) ncmd 1 Status: Success (0x00) Size: 25 < HCI Command: LE Clear Resolving... (0x08|0x0029) plen 0 #61 [hci0] 28.922166 > HCI Event: Command Complete (0x0e) plen 4 #62 [hci0] 28.922872 LE Clear Resolving List (0x08|0x0029) ncmd 1 Status: Success (0x00) < HCI Command: LE Read Maximum Dat.. (0x08|0x002f) plen 0 #63 [hci0] 28.923117 > HCI Event: Command Complete (0x0e) plen 12 #64 [hci0] 28.924030 LE Read Maximum Data Length (0x08|0x002f) ncmd 1 Status: Success (0x00) Max TX octets: 251 Max TX time: 17040 Max RX octets: 251 Max RX time: 17040 Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_core.c | 5 +++++ net/bluetooth/hci_event.c | 17 +++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 484f24c7a415..4af1a3a4d9b1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1490,6 +1490,8 @@ struct hci_cp_le_write_def_data_len { __le16 tx_time; } __packed; +#define HCI_OP_LE_CLEAR_RESOLV_LIST 0x2029 + #define HCI_OP_LE_READ_RESOLV_LIST_SIZE 0x202a struct hci_rp_le_read_resolv_list_size { __u8 status; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 036e14267d0a..ce2447d89ce1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -720,6 +720,11 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) 0, NULL); } + if (hdev->commands[34] & 0x20) { + /* Clear LE Resolving List */ + hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6ee69a79258f..562e7a854ed6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1307,6 +1307,19 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev, hdev->le_def_tx_time = le16_to_cpu(sent->tx_time); } +static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + hci_bdaddr_list_clear(&hdev->le_resolv_list); +} + static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3029,6 +3042,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_write_def_data_len(hdev, skb); break; + case HCI_OP_LE_CLEAR_RESOLV_LIST: + hci_cc_le_clear_resolv_list(hdev, skb); + break; + case HCI_OP_LE_READ_RESOLV_LIST_SIZE: hci_cc_le_read_resolv_list_size(hdev, skb); break; -- cgit v1.2.3 From a2344b9e3a8c5c2064306b0d99b0e9a6c4813c08 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:28 +0530 Subject: Bluetooth: Use extended scanning if controller supports This implements Set extended scan param and set extended scan enable commands and use it for start LE scan based on controller support. The new features added in these commands are setting of new PHY for scanning and setting of scan duration. Both features are disabled for now, meaning only 1M PHY is set and scan duration is set to 0 which means that scanning will be done untill scan disable is called. < HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 8 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x01 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 24 +++++++++ include/net/bluetooth/hci_core.h | 4 ++ net/bluetooth/hci_event.c | 51 ++++++++++++++++++ net/bluetooth/hci_request.c | 110 ++++++++++++++++++++++++++++++--------- 4 files changed, 164 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4af1a3a4d9b1..8c2868f439e7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,30 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 +struct hci_cp_le_set_ext_scan_params { + __u8 own_addr_type; + __u8 filter_policy; + __u8 scanning_phys; + __u8 data[0]; +} __packed; + +#define LE_SCAN_PHY_1M 0x01 + +struct hci_cp_le_scan_phy_params { + __u8 type; + __le16 interval; + __le16 window; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_ENABLE 0x2042 +struct hci_cp_le_set_ext_scan_enable { + __u8 enable; + __u8 filter_dup; + __le16 duration; + __le16 period; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 409f49bd8338..cc0bde74dd45 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1158,6 +1158,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +/* Use ext scanning if set ext scan param and ext scan enable is supported */ +#define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ + ((dev)->commands[37] & 0x40)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 9ec07cd4ab13..15afad005d72 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1098,6 +1098,31 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_scan_params *cp; + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_scan_phy_params *phy_param; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS); + if (!cp) + return; + + phy_param = (void *)cp->data; + + hci_dev_lock(hdev); + + hdev->le_scan_type = phy_param->type; + + hci_dev_unlock(hdev); +} + static bool has_pending_adv_report(struct hci_dev *hdev) { struct discovery_state *d = &hdev->discovery; @@ -1202,6 +1227,24 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev, le_set_scan_enable_complete(hdev, cp->enable); } +static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_scan_enable *cp; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE); + if (!cp) + return; + + le_set_scan_enable_complete(hdev, cp->enable); +} + static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3079,6 +3122,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_ssp_debug_mode(hdev, skb); break; + case HCI_OP_LE_SET_EXT_SCAN_PARAMS: + hci_cc_le_set_ext_scan_param(hdev, skb); + break; + + case HCI_OP_LE_SET_EXT_SCAN_ENABLE: + hci_cc_le_set_ext_scan_enable(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 76dcc3f14cea..faf7c711234c 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -647,11 +647,22 @@ void __hci_req_update_eir(struct hci_request *req) void hci_req_add_le_scan_disable(struct hci_request *req) { - struct hci_cp_le_set_scan_enable cp; + struct hci_dev *hdev = req->hdev; - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_DISABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_DISABLE; + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), + &cp); + } else { + struct hci_cp_le_set_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_DISABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + } } static void add_to_white_list(struct hci_request *req, @@ -770,23 +781,60 @@ static bool scan_use_rpa(struct hci_dev *hdev) static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { - struct hci_cp_le_set_scan_param param_cp; - struct hci_cp_le_set_scan_enable enable_cp; - - memset(¶m_cp, 0, sizeof(param_cp)); - param_cp.type = type; - param_cp.interval = cpu_to_le16(interval); - param_cp.window = cpu_to_le16(window); - param_cp.own_address_type = own_addr_type; - param_cp.filter_policy = filter_policy; - hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), - ¶m_cp); + struct hci_dev *hdev = req->hdev; - memset(&enable_cp, 0, sizeof(enable_cp)); - enable_cp.enable = LE_SCAN_ENABLE; - enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), - &enable_cp); + /* Use ext scanning if set ext scan param and ext scan enable is + * supported + */ + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_params *ext_param_cp; + struct hci_cp_le_set_ext_scan_enable ext_enable_cp; + struct hci_cp_le_scan_phy_params *phy_params; + /* Ony single PHY (1M) is supported as of now */ + u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 1]; + + ext_param_cp = (void *)data; + phy_params = (void *)ext_param_cp->data; + + memset(ext_param_cp, 0, sizeof(*ext_param_cp)); + ext_param_cp->own_addr_type = own_addr_type; + ext_param_cp->filter_policy = filter_policy; + ext_param_cp->scanning_phys = LE_SCAN_PHY_1M; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS, + sizeof(*ext_param_cp) + sizeof(*phy_params), + ext_param_cp); + + memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); + ext_enable_cp.enable = LE_SCAN_ENABLE; + ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, + sizeof(ext_enable_cp), &ext_enable_cp); + } else { + struct hci_cp_le_set_scan_param param_cp; + struct hci_cp_le_set_scan_enable enable_cp; + + memset(¶m_cp, 0, sizeof(param_cp)); + param_cp.type = type; + param_cp.interval = cpu_to_le16(interval); + param_cp.window = cpu_to_le16(window); + param_cp.own_address_type = own_addr_type; + param_cp.filter_policy = filter_policy; + hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp), + ¶m_cp); + + memset(&enable_cp, 0, sizeof(enable_cp)); + enable_cp.enable = LE_SCAN_ENABLE; + enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp), + &enable_cp); + } } void hci_req_add_le_passive_scan(struct hci_request *req) @@ -1948,7 +1996,6 @@ discov_stopped: static int le_scan_restart(struct hci_request *req, unsigned long opt) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_enable cp; /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) @@ -1956,10 +2003,23 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt) hci_req_add_le_scan_disable(req); - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_ENABLE; - cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + if (use_ext_scan(hdev)) { + struct hci_cp_le_set_ext_scan_enable ext_enable_cp; + + memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); + ext_enable_cp.enable = LE_SCAN_ENABLE; + ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, + sizeof(ext_enable_cp), &ext_enable_cp); + } else { + struct hci_cp_le_set_scan_enable cp; + + memset(&cp, 0, sizeof(cp)); + cp.enable = LE_SCAN_ENABLE; + cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; + hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); + } return 0; } -- cgit v1.2.3 From c215e9397b00b3045a668120ed7dbd89f2866e74 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 17:05:29 +0530 Subject: Bluetooth: Process extended ADV report event This patch enables Extended ADV report event if extended scanning is supported in the controller and process the same. The new features are not handled and for now its as good as legacy ADV report. > HCI Event: LE Meta Event (0x3e) plen 53 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x0013 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Legacy PDU Type: ADV_IND (0x0013) Address type: Random (0x01) Address: DB:7E:2E:1A:85:E8 (Static) Primary PHY: LE 1M Secondary PHY: LE 1M SID: 0x00 TX power: 0 dBm RSSI: -90 dBm (0xa6) Periodic advertising invteral: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1b 0f 09 44 65 73 69 67 6e 65 72 20 4d 6f 75 73 65 ..Designer Mouse 03 19 c2 03 02 01 05 03 03 12 18 ........... Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 26 +++++++++++++++++++++++ net/bluetooth/hci_core.c | 9 ++++++++ net/bluetooth/hci_event.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8c2868f439e7..0ec51eb14810 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1925,6 +1925,15 @@ struct hci_ev_le_conn_complete { #define LE_ADV_SCAN_IND 0x02 #define LE_ADV_NONCONN_IND 0x03 #define LE_ADV_SCAN_RSP 0x04 +#define LE_ADV_INVALID 0x05 + +/* Legacy event types in extended adv report */ +#define LE_LEGACY_ADV_IND 0x0013 +#define LE_LEGACY_ADV_DIRECT_IND 0x0015 +#define LE_LEGACY_ADV_SCAN_IND 0x0012 +#define LE_LEGACY_NONCONN_IND 0x0010 +#define LE_LEGACY_SCAN_RSP_ADV 0x001b +#define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 @@ -1989,6 +1998,23 @@ struct hci_ev_le_direct_adv_info { __s8 rssi; } __packed; +#define HCI_EV_LE_EXT_ADV_REPORT 0x0d +struct hci_ev_le_ext_adv_report { + __le16 evt_type; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 primary_phy; + __u8 secondary_phy; + __u8 sid; + __u8 tx_power; + __s8 rssi; + __le16 interval; + __u8 direct_addr_type; + bdaddr_t direct_addr; + __u8 length; + __u8 data[0]; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ce2447d89ce1..e3ec2d782762 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -695,6 +695,15 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[35] & (0x20 | 0x40)) events[1] |= 0x08; /* LE PHY Update Complete */ + /* If the controller supports LE Set Extended Scan Parameters + * and LE Set Extended Scan Enable commands, enable the + * corresponding event. + */ + if (use_ext_scan(hdev)) + events[1] |= 0x10; /* LE Extended Advertising + * Report + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 15afad005d72..6c6fd4f55f23 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5048,6 +5048,54 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static u8 convert_legacy_evt_type(u16 evt_type) +{ + switch (evt_type) { + case LE_LEGACY_ADV_IND: + return LE_ADV_IND; + case LE_LEGACY_ADV_DIRECT_IND: + return LE_ADV_DIRECT_IND; + case LE_LEGACY_ADV_SCAN_IND: + return LE_ADV_SCAN_IND; + case LE_LEGACY_NONCONN_IND: + return LE_ADV_NONCONN_IND; + case LE_LEGACY_SCAN_RSP_ADV: + case LE_LEGACY_SCAN_RSP_ADV_SCAN: + return LE_ADV_SCAN_RSP; + } + + BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", + evt_type); + + return LE_ADV_INVALID; +} + +static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + u8 num_reports = skb->data[0]; + void *ptr = &skb->data[1]; + + hci_dev_lock(hdev); + + while (num_reports--) { + struct hci_ev_le_ext_adv_report *ev = ptr; + u8 legacy_evt_type; + u16 evt_type; + + evt_type = __le16_to_cpu(ev->evt_type); + legacy_evt_type = convert_legacy_evt_type(evt_type); + if (legacy_evt_type != LE_ADV_INVALID) { + process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, + ev->bdaddr_type, NULL, 0, ev->rssi, + ev->data, ev->length); + } + + ptr += sizeof(*ev) + ev->length + 1; + } + + hci_dev_unlock(hdev); +} + static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5280,6 +5328,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_direct_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_EXT_ADV_REPORT: + hci_le_ext_adv_report_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From 4d94f95d30c8fbfe86068e9abed110974d697cf5 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Fri, 6 Jul 2018 22:50:32 +0200 Subject: Bluetooth: Use extended LE Connection if supported This implements extended LE craete connection and enhanced LE conn complete event if the controller supports. For now it is as good as legacy LE connection and event as no new features in the extended connection is handled. < HCI Command: LE Extended Create Connection (0x08|0x0043) plen 26 Filter policy: White list is not used (0x00) Own address type: Public (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Initiating PHYs: 0x01 Entry 0: LE 1M Scan interval: 60.000 msec (0x0060) Scan window: 60.000 msec (0x0060) Min connection interval: 50.00 msec (0x0028) Max connection interval: 70.00 msec (0x0038) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Min connection length: 0.000 msec (0x0000) Max connection length: 0.000 msec (0x0000) > HCI Event: Command Status (0x0f) plen 4 LE Extended Create Connection (0x08|0x0043) ncmd 2 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 31 LE Enhanced Connection Complete (0x0a) Status: Success (0x00) Handle: 3585 Role: Master (0x00) Peer address type: Random (0x01) Peer address: DB:7E:2E:1D:85:E8 (Static) Local resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Peer resolvable private address: 00:00:00:00:00:00 (Non-Resolvable) Connection interval: 67.50 msec (0x0036) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Master clock accuracy: 0x00 @ MGMT Event: Device Connected (0x000b) plen 40 LE Address: DB:7E:2E:1D:85:E8 (Static) Flags: 0x00000000 Data length: 27 Name (complete): Designer Mouse Appearance: Mouse (0x03c2) Flags: 0x05 LE Limited Discoverable Mode BR/EDR Not Supported 16-bit Service UUIDs (complete): 1 entry Human Interface Device (0x1812) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 36 ++++++++++++++++++++ include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_conn.c | 72 ++++++++++++++++++++++++++++++---------- net/bluetooth/hci_core.c | 8 +++++ net/bluetooth/hci_event.c | 47 ++++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0ec51eb14810..73e48be5bbb3 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1538,6 +1538,27 @@ struct hci_cp_le_set_ext_scan_enable { __le16 period; } __packed; +#define HCI_OP_LE_EXT_CREATE_CONN 0x2043 +struct hci_cp_le_ext_create_conn { + __u8 filter_policy; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 phys; + __u8 data[0]; +} __packed; + +struct hci_cp_le_ext_conn_param { + __le16 scan_interval; + __le16 scan_window; + __le16 conn_interval_min; + __le16 conn_interval_max; + __le16 conn_latency; + __le16 supervision_timeout; + __le16 min_ce_len; + __le16 max_ce_len; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 @@ -2015,6 +2036,21 @@ struct hci_ev_le_ext_adv_report { __u8 data[0]; } __packed; +#define HCI_EV_LE_ENHANCED_CONN_COMPLETE 0x0a +struct hci_ev_le_enh_conn_complete { + __u8 status; + __le16 handle; + __u8 role; + __u8 bdaddr_type; + bdaddr_t bdaddr; + bdaddr_t local_rpa; + bdaddr_t peer_rpa; + __le16 interval; + __le16 latency; + __le16 supervision_timeout; + __u8 clk_accurancy; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cc0bde74dd45..a74453571264 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1161,6 +1161,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) +/* Use ext create connection if command is supported */ +#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 45ff5dc124cc..cc967ca67962 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -752,7 +752,6 @@ static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_conn *conn, bdaddr_t *direct_rpa) { - struct hci_cp_le_create_conn cp; struct hci_dev *hdev = conn->hdev; u8 own_addr_type; @@ -775,25 +774,62 @@ static void hci_req_add_le_create_conn(struct hci_request *req, return; } - memset(&cp, 0, sizeof(cp)); + if (use_ext_conn(hdev)) { + struct hci_cp_le_ext_create_conn *cp; + struct hci_cp_le_ext_conn_param *p; + /* As of now only LE 1M is supported */ + u8 data[sizeof(*cp) + sizeof(*p) * 1]; - /* Set window to be the same value as the interval to enable - * continuous scanning. - */ - cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); - cp.scan_window = cp.scan_interval; + cp = (void *) data; + p = (void *) cp->data; - bacpy(&cp.peer_addr, &conn->dst); - cp.peer_addr_type = conn->dst_type; - cp.own_address_type = own_addr_type; - cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); - cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); - cp.conn_latency = cpu_to_le16(conn->le_conn_latency); - cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); - cp.min_ce_len = cpu_to_le16(0x0000); - cp.max_ce_len = cpu_to_le16(0x0000); - - hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); + memset(cp, 0, sizeof(*cp)); + + bacpy(&cp->peer_addr, &conn->dst); + cp->peer_addr_type = conn->dst_type; + cp->own_addr_type = own_addr_type; + cp->phys = LE_SCAN_PHY_1M; + + memset(p, 0, sizeof(*p)); + + /* Set window to be the same value as the interval to enable + * continuous scanning. + */ + + p->scan_interval = cpu_to_le16(hdev->le_scan_interval); + p->scan_window = p->scan_interval; + p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + p->conn_latency = cpu_to_le16(conn->le_conn_latency); + p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); + p->min_ce_len = cpu_to_le16(0x0000); + p->max_ce_len = cpu_to_le16(0x0000); + + hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, sizeof(data), data); + + } else { + struct hci_cp_le_create_conn cp; + + memset(&cp, 0, sizeof(cp)); + + /* Set window to be the same value as the interval to enable + * continuous scanning. + */ + cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); + cp.scan_window = cp.scan_interval; + + bacpy(&cp.peer_addr, &conn->dst); + cp.peer_addr_type = conn->dst_type; + cp.own_address_type = own_addr_type; + cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); + cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); + cp.conn_latency = cpu_to_le16(conn->le_conn_latency); + cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); + cp.min_ce_len = cpu_to_le16(0x0000); + cp.max_ce_len = cpu_to_le16(0x0000); + + hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); + } conn->state = BT_CONNECT; clear_bit(HCI_CONN_SCANNING, &conn->flags); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e3ec2d782762..f5c21004186c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -704,6 +704,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ + /* If the controller supports the LE Extended Create Connection + * command, enable the corresponding event. + */ + if (use_ext_conn(hdev)) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 14e42e157de9..68192152c23b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2031,6 +2031,31 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } +static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_ext_create_conn *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + /* All connection failure handling is taken care of by the + * hci_le_conn_failed function which is triggered by the HCI + * request completion callbacks used for connecting. + */ + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN); + if (!cp) + return; + + hci_dev_lock(hdev); + + cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type, + cp->own_addr_type, cp->filter_policy); + + hci_dev_unlock(hdev); +} + static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) { struct hci_cp_le_read_remote_features *cp; @@ -3233,6 +3258,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_le_start_enc(hdev, ev->status); break; + case HCI_OP_LE_EXT_CREATE_CONN: + hci_cs_le_ext_create_conn(hdev, ev->status); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; @@ -4733,6 +4762,20 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, + ev->role, le16_to_cpu(ev->handle), + le16_to_cpu(ev->interval), + le16_to_cpu(ev->latency), + le16_to_cpu(ev->supervision_timeout)); +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5352,6 +5395,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_ext_adv_report_evt(hdev, skb); break; + case HCI_EV_LE_ENHANCED_CONN_COMPLETE: + hci_le_enh_conn_complete_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From 351782067b6be81879b0af0daf7bd3acbb32d986 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:54 -0400 Subject: ipv4: ipcm_cookie initializers Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 15 +++++++++++++++ net/ipv4/icmp.c | 11 ++--------- net/ipv4/ip_output.c | 6 +----- net/ipv4/ping.c | 9 +-------- net/ipv4/raw.c | 9 +-------- net/ipv4/udp.c | 10 +--------- 6 files changed, 21 insertions(+), 39 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 99d1b835d2aa..6db23bf1e5eb 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -79,6 +79,21 @@ struct ipcm_cookie { __u16 gso_size; }; +static inline void ipcm_init(struct ipcm_cookie *ipcm) +{ + *ipcm = (struct ipcm_cookie) { .tos = -1 }; +} + +static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, + const struct inet_sock *inet) +{ + ipcm_init(ipcm); + + ipcm->sockc.tsflags = inet->sk.sk_tsflags; + ipcm->oif = inet->sk.sk_bound_dev_if; + ipcm->addr = inet->inet_saddr; +} + #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 937239afd68d..695979b7ef6d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -429,15 +429,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) icmp_param->data.icmph.checksum = 0; + ipcm_init(&ipc); inet->tos = ip_hdr(skb)->tos; sk->sk_mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; @@ -711,12 +707,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; + ipcm_init(&ipc); ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, type, code, &icmp_param); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 570e3ebc3974..81d0e4a77ec5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1548,12 +1548,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; + ipcm_init(&ipc); ipc.addr = daddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b47492205507..6f17fc8ebbdb 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -739,14 +739,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.oif = sk->sk_bound_dev_if; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 446af7be2b55..cf142909389c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -562,14 +562,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.sockc.transmit_time = 0; - ipc.addr = inet->inet_saddr; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5c76ba0666ec..87f3a0b77864 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -926,12 +926,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; - ipc.opt = NULL; - ipc.tx_flags = 0; - ipc.ttl = 0; - ipc.tos = -1; - ipc.sockc.transmit_time = 0; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; @@ -978,9 +972,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) connected = 1; } - ipc.sockc.tsflags = sk->sk_tsflags; - ipc.addr = inet->inet_saddr; - ipc.oif = sk->sk_bound_dev_if; + ipcm_init_sk(&ipc, inet); ipc.gso_size = up->gso_size; if (msg->msg_controllen) { -- cgit v1.2.3 From b515430ac9c25d5192cf498af3c6be6c4f51caad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:55 -0400 Subject: ipv6: ipcm6_cookie initializer Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ipv6.h | 19 +++++++++++++++++++ net/ipv6/icmp.c | 7 ++----- net/ipv6/ping.c | 4 +--- net/ipv6/raw.c | 5 +---- net/ipv6/udp.c | 4 +--- net/l2tp/l2tp_ip6.c | 4 +--- 6 files changed, 25 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b7843e0b16ee..6cb247f54d4c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -301,6 +301,25 @@ struct ipcm6_cookie { __u16 gso_size; }; +static inline void ipcm6_init(struct ipcm6_cookie *ipc6) +{ + *ipc6 = (struct ipcm6_cookie) { + .hlimit = -1, + .tclass = -1, + .dontfrag = -1, + }; +} + +static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, + const struct ipv6_pinfo *np) +{ + *ipc6 = (struct ipcm6_cookie) { + .hlimit = -1, + .tclass = np->tclass, + .dontfrag = np->dontfrag, + }; +} + static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index be491bf6ab6e..d99fed67cd10 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -545,7 +545,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); @@ -553,8 +553,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -726,10 +724,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; + ipcm6_init_sk(&ipc6, np); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 96f56bf49a30..717e7c1fba29 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - ipc6.tclass = np->tclass; + ipcm6_init_sk(&ipc6, np); fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false); @@ -142,8 +142,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.family = AF_INET6; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - ipc6.dontfrag = np->dontfrag; - ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5737c50f16eb..5f40670271ee 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -791,10 +791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; - ipc6.opt = NULL; + ipcm6_init(&ipc6); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ac6fc6728903..940115da9843 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1143,9 +1143,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sockcm_cookie sockc; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; sockc.tsflags = sk->sk_tsflags; sockc.transmit_time = 0; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 957369192ca1..38f80691f4ab 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -525,9 +525,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; fl6.flowi6_uid = sk->sk_uid; - ipc6.hlimit = -1; - ipc6.tclass = -1; - ipc6.dontfrag = -1; + ipcm6_init(&ipc6); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) -- cgit v1.2.3 From 657a0667025e77cc17f8a38b93e60a2bc24d830c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:56 -0400 Subject: sock: sockc cookie initializer Initialize the cookie in one location to reduce code duplication and avoid bugs from inconsistent initialization, such as that fixed in commit 9887cba19978 ("ip: limit use of gso_size to udp"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 6 ++++++ net/ipv4/tcp.c | 2 +- net/packet/af_packet.c | 9 +++------ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index e0eac9ef44b5..83b747538bd0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1600,6 +1600,12 @@ struct sockcm_cookie { u16 tsflags; }; +static inline void sockcm_init(struct sockcm_cookie *sockc, + const struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; +} + int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf461fa77ed6..850dc8f15afc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1241,7 +1241,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3428f7739ae9..47931ebfaef3 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1951,8 +1951,7 @@ retry: goto out_unlock; } - sockc.transmit_time = 0; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -2636,8 +2635,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; - sockc.transmit_time = 0; - sockc.tsflags = po->sk.sk_tsflags; + sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) @@ -2833,8 +2831,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; - sockc.transmit_time = 0; - sockc.tsflags = sk->sk_tsflags; + sockcm_init(&sockc, sk); sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); -- cgit v1.2.3 From 5fdaa88dfefa87ee1ea92750e99950dca182ea41 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:57 -0400 Subject: ipv6: fold sockcm_cookie into ipcm6_cookie ipcm_cookie includes sockcm_cookie. Do the same for ipcm6_cookie. This reduces the number of arguments that need to be passed around, applies ipcm6_init to all cookie fields at once and reduces code differentiation between ipv4 and ipv6. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ipv6.h | 7 +++---- include/net/transp_v6.h | 3 +-- net/ipv6/datagram.c | 4 ++-- net/ipv6/icmp.c | 7 ++----- net/ipv6/ip6_flowlabel.c | 3 +-- net/ipv6/ip6_output.c | 24 ++++++++++-------------- net/ipv6/ipv6_sockglue.c | 3 +-- net/ipv6/ping.c | 3 +-- net/ipv6/raw.c | 10 ++++------ net/ipv6/udp.c | 10 ++++------ net/l2tp/l2tp_ip6.c | 6 ++---- 11 files changed, 31 insertions(+), 49 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6cb247f54d4c..aa6fd11a887c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -294,6 +294,7 @@ struct ipv6_fl_socklist { }; struct ipcm6_cookie { + struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __s8 dontfrag; @@ -959,8 +960,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc); + struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); @@ -977,8 +977,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc); + struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index f6a3543e5247..a8f6020f1196 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -42,8 +42,7 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, - struct flowi6 *fl6, struct ipcm6_cookie *ipc6, - struct sockcm_cookie *sockc); + struct flowi6 *fl6, struct ipcm6_cookie *ipc6); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2ee08b6a86a4..201306b9b5ea 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -736,7 +736,7 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) + struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; @@ -755,7 +755,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, sockc); + err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); if (err) return err; continue; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d99fed67cd10..24611c8b0562 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -430,7 +430,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; @@ -573,7 +572,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused)) { + MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { @@ -677,7 +676,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); - struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -731,8 +729,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused)) { + (struct rt6_info *)dst, MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3eee7637bdfe..cb54a8a3c273 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -373,7 +373,6 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; err = -ENOMEM; @@ -392,7 +391,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, memset(&flowi6, 0, sizeof(flowi6)); ipc6.opt = fl->opt; - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f48af7e62f12..1a3bf6437cb9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1158,8 +1158,7 @@ static void ip6_append_data_mtu(unsigned int *mtu, static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, - struct rt6_info *rt, struct flowi6 *fl6, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; @@ -1227,7 +1226,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; - cork->base.transmit_time = sockc->transmit_time; + cork->base.transmit_time = ipc6->sockc.transmit_time; return 0; } @@ -1241,8 +1240,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6, - const struct sockcm_cookie *sockc) + unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; @@ -1321,7 +1319,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1563,8 +1561,7 @@ int ip6_append_data(struct sock *sk, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, - const struct sockcm_cookie *sockc) + struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1578,7 +1575,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ err = ip6_setup_cork(sk, &inet->cork, &np->cork, - ipc6, rt, fl6, sockc); + ipc6, rt, fl6); if (err) return err; @@ -1592,7 +1589,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6, sockc); + from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1752,8 +1749,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - struct inet_cork_full *cork, - const struct sockcm_cookie *sockc) + struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; @@ -1770,7 +1766,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.opt = NULL; cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6, sockc); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); @@ -1781,7 +1777,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6, sockc); + flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4d780c7f0130..fabe3ba1bddc 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -489,7 +489,6 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; - struct sockcm_cookie sockc_junk; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); @@ -522,7 +521,7 @@ sticky_done: msg.msg_control = (void *)(opt+1); ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 717e7c1fba29..4c04bccc7417 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -62,7 +62,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; - struct sockcm_cookie junk = {0}; struct ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -146,7 +145,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, 0, &ipc6, &fl6, rt, - MSG_DONTWAIT, &junk); + MSG_DONTWAIT); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5f40670271ee..413d98bf24f4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -767,7 +767,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; - struct sockcm_cookie sockc; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; u16 proto; @@ -792,6 +791,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); + ipc6.sockc.tsflags = sk->sk_tsflags; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) @@ -845,15 +845,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; - sockc.tsflags = sk->sk_tsflags; - sockc.transmit_time = 0; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -921,13 +919,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: if (inet->hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, - msg->msg_flags, &sockc); + msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 940115da9843..f6b96956a8ed 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1141,12 +1141,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - struct sockcm_cookie sockc; ipcm6_init(&ipc6); ipc6.gso_size = up->gso_size; - sockc.tsflags = sk->sk_tsflags; - sockc.transmit_time = 0; + ipc6.sockc.tsflags = sk->sk_tsflags; /* destination address check */ if (sin6) { @@ -1281,7 +1279,7 @@ do_udp_sendmsg: err = udp_cmsg_send(sk, msg, &ipc6.gso_size); if (err > 0) err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, - &ipc6, &sockc); + &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1375,7 +1373,7 @@ back_from_confirm: skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &cork, &sockc); + msg->msg_flags, &cork); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6, &cork.base); @@ -1401,7 +1399,7 @@ do_append_data: up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 38f80691f4ab..672e5b753738 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -500,7 +500,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; - struct sockcm_cookie sockc_unused = {0}; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ @@ -573,8 +572,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, - &sockc_unused); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -639,7 +637,7 @@ back_from_confirm: err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc_unused); + msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) -- cgit v1.2.3 From 678ca42d688534adfc780b150abefaaac7c86687 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 6 Jul 2018 10:12:58 -0400 Subject: ip: remove tx_flags from ipcm_cookie and use same logic for v4 and v6 skb_shinfo(skb)->tx_flags is derived from sk->sk_tsflags, possibly after modification by __sock_cmsg_send, by calling sock_tx_timestamp. The IPv4 and IPv6 paths do this conversion differently. In IPv4, the individual protocols that support tx timestamps call this function and store the result in ipc.tx_flags. In IPv6, sock_tx_timestamp is called in __ip6_append_data. There is no need to store both tx_flags and ts_flags in the cookie as one is derived from the other. Convert when setting up the cork and remove the redundant field. This is similar to IPv6, only have the conversion happen only once per datagram, in ip(6)_setup_cork. Also change __ip6_append_data to match __ip_append_data. Only update tskey if timestamping is enabled with OPT_ID. The SOCK_.. test is redundant: only valid protocols can have non-zero cork->tx_flags. After this change the IPv4 and IPv6 logic is the same. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/ip_output.c | 3 ++- net/ipv4/ping.c | 2 -- net/ipv4/raw.c | 2 -- net/ipv4/udp.c | 2 -- net/ipv6/ip6_output.c | 18 ++++++++---------- 6 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 6db23bf1e5eb..e44b1a44f67a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -72,7 +72,6 @@ struct ipcm_cookie { __be32 addr; int oif; struct ip_options_rcu *opt; - __u8 tx_flags; __u8 ttl; __s16 tos; char priority; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 81d0e4a77ec5..e14c774cc092 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1153,8 +1153,9 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->priority = ipc->priority; - cork->tx_flags = ipc->tx_flags; cork->transmit_time = ipc->sockc.transmit_time; + cork->tx_flags = 0; + sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); return 0; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 6f17fc8ebbdb..b54c964ad925 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -763,8 +763,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index cf142909389c..33df4d76db2d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -665,8 +665,6 @@ back_from_confirm: &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 87f3a0b77864..060e841dde40 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1020,8 +1020,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; - sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); - if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1a3bf6437cb9..ff4b28a600ab 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1221,6 +1221,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.fragsize = mtu; cork->base.gso_size = sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; + cork->base.tx_flags = 0; + sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; @@ -1250,7 +1252,6 @@ static int __ip6_append_data(struct sock *sk, int copy; int err; int offset = 0; - __u8 tx_flags = 0; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; struct ipv6_txoptions *opt = v6_cork->opt; @@ -1269,6 +1270,10 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = sk->sk_tskey++; + hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1318,13 +1323,6 @@ emsgsize: rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; - if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &tx_flags); - if (tx_flags & SKBTX_ANY_SW_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; - } - /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1443,8 +1441,8 @@ alloc_new_skb: dst_exthdrlen); /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = tx_flags; - tx_flags = 0; + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; -- cgit v1.2.3 From 22dd149167359981ea6f4afde04026fb78747ddc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Jul 2018 14:58:51 +0200 Subject: devlink: fix incorrect return statement A newly added dummy helper function tries to return a failure from a "void" function: In file included from include/net/dsa.h:24, from arch/arm/plat-orion/common.c:21: include/net/devlink.h: In function 'devlink_param_value_changed': include/net/devlink.h:771:9: error: 'return' with a value, in function returning void [-Werror] return -EOPNOTSUPP; This fixes it by removing the bogus statement. Fixes: ea601e170988 ("devlink: Add devlink notifications support for params") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/devlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ed571385626..f67c29cede15 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -768,7 +768,6 @@ devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, static inline void devlink_param_value_changed(struct devlink *devlink, u32 param_id) { - return -EOPNOTSUPP; } #endif -- cgit v1.2.3 From 2064c3d4c02026572d4975177f28a58052f0a8b7 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:12 +0000 Subject: net/flow_dissector: Save vlan ethertype from headers Change vlan dissector key to save vlan tpid to support both 802.1Q and 802.1AD ethertype. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index adc24df56b90..8f899688a965 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -47,7 +47,7 @@ struct flow_dissector_key_tags { struct flow_dissector_key_vlan { u16 vlan_id:12, vlan_priority:3; - u16 padding; + __be16 vlan_tpid; }; struct flow_dissector_key_mpls { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 53f96e4f7bf5..18cb99b50cba 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -751,6 +751,7 @@ proto_again: const struct vlan_hdr *vlan; struct vlan_hdr _vlan; bool vlan_tag_present = skb && skb_vlan_tag_present(skb); + __be16 saved_vlan_tpid = proto; if (vlan_tag_present) proto = skb->protocol; @@ -789,6 +790,7 @@ proto_again: (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } + key_vlan->vlan_tpid = saved_vlan_tpid; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; -- cgit v1.2.3 From 24c590e3b0f9eebe603ebe3d516990306d385f46 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:14 +0000 Subject: net/flow_dissector: Add support for QinQ dissection Dissect the QinQ packets to get both outer and inner vlan information, then store to the extended flow keys. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 ++ net/core/flow_dissector.c | 32 +++++++++++++++++--------------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8f899688a965..c64406717eee 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -206,6 +206,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ + FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_MAX, }; @@ -237,6 +238,7 @@ struct flow_keys { struct flow_dissector_key_basic basic; struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_addrs addrs; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 18cb99b50cba..b555fc229e96 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -589,7 +589,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; - bool skip_vlan = false; + enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -748,15 +748,14 @@ proto_again: } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { - const struct vlan_hdr *vlan; + const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; - bool vlan_tag_present = skb && skb_vlan_tag_present(skb); __be16 saved_vlan_tpid = proto; - if (vlan_tag_present) + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && + skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; - - if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { + } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { @@ -766,20 +765,23 @@ proto_again: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); - if (skip_vlan) { - fdret = FLOW_DISSECT_RET_PROTO_AGAIN; - break; - } } - skip_vlan = true; - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN)) { + if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { + dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; + } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { + dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; + } else { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + + if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLAN, + dissector_vlan, target_container); - if (vlan_tag_present) { + if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); -- cgit v1.2.3 From eec94fdb04806790c7b7e6ea347820064cc6d467 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:23 +0300 Subject: net: sched: use rcu for action cookie update Implement functions to atomically update and free action cookie using rcu mechanism. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/pkt_cls.h | 1 + net/sched/act_api.c | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 5ff11adbe2a6..ffc3ef321776 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -37,7 +37,7 @@ struct tc_action { spinlock_t tcfa_lock; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct tc_cookie *act_cookie; + struct tc_cookie __rcu *act_cookie; struct tcf_chain *goto_chain; }; #define tcf_index common.tcfa_index diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6641584b27f1..2081e4219f81 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -781,6 +781,7 @@ struct tc_mqprio_qopt_offload { struct tc_cookie { u8 *data; u32 len; + struct rcu_head rcu; }; struct tc_qopt_offload_stats { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3f4cf930f809..02670c7489e3 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } +static void tcf_free_cookie_rcu(struct rcu_head *p) +{ + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); + + kfree(cookie->data); + kfree(cookie); +} + +static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, + struct tc_cookie *new_cookie) +{ + struct tc_cookie *old; + + old = xchg(old_cookie, new_cookie); + if (old) + call_rcu(&old->rcu, tcf_free_cookie_rcu); +} + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); - if (p->act_cookie) { - kfree(p->act_cookie->data); - kfree(p->act_cookie); - } + tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); @@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; - if (a->act_cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, - a->act_cookie->data)) + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); goto nla_put_failure; + } } + rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) goto err_mod; - if (name == NULL && tb[TCA_ACT_COOKIE]) { - if (a->act_cookie) { - kfree(a->act_cookie->data); - kfree(a->act_cookie); - } - a->act_cookie = cookie; - } + if (!name && tb[TCA_ACT_COOKIE]) + tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then -- cgit v1.2.3 From 036bb44327f50273e85ee4a2c9b56eebce1c0838 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:24 +0300 Subject: net: sched: change type of reference and bind counters Change type of action reference counter to refcount_t. Change type of action bind counter to atomic_t. This type is used to allow decrementing bind counter without testing for 0 result. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 32 ++++++++++++++++++++++---------- net/sched/act_bpf.c | 4 ++-- net/sched/act_connmark.c | 4 ++-- net/sched/act_csum.c | 4 ++-- net/sched/act_gact.c | 4 ++-- net/sched/act_ife.c | 4 ++-- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 4 ++-- net/sched/act_nat.c | 4 ++-- net/sched/act_pedit.c | 4 ++-- net/sched/act_police.c | 4 ++-- net/sched/act_sample.c | 4 ++-- net/sched/act_simple.c | 4 ++-- net/sched/act_skbedit.c | 4 ++-- net/sched/act_skbmod.c | 4 ++-- net/sched/act_tunnel_key.c | 4 ++-- net/sched/act_vlan.c | 4 ++-- 18 files changed, 57 insertions(+), 44 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index ffc3ef321776..2759226527a2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -6,6 +6,7 @@ * Public action API for classifiers/qdiscs */ +#include #include #include #include @@ -26,8 +27,8 @@ struct tc_action { struct tcf_idrinfo *idrinfo; u32 tcfa_index; - int tcfa_refcnt; - int tcfa_bindcnt; + refcount_t tcfa_refcnt; + atomic_t tcfa_bindcnt; u32 tcfa_capab; int tcfa_action; struct tcf_t tcfa_tm; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 02670c7489e3..4f064ecab882 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) ASSERT_RTNL(); + /* Release with strict==1 and bind==0 is only called through act API + * interface (classifiers always bind). Only case when action with + * positive reference count and zero bind count can exist is when it was + * also created with act API (unbinding last classifier will destroy the + * action if it was created by classifier). So only case when bind count + * can be changed after initial check is when unbound action is + * destroyed by act API while classifier binds to action with same id + * concurrently. This result either creation of new action(same behavior + * as before), or reusing existing action if concurrent process + * increments reference count before action is deleted. Both scenarios + * are acceptable. + */ if (p) { if (bind) - p->tcfa_bindcnt--; - else if (strict && p->tcfa_bindcnt > 0) + atomic_dec(&p->tcfa_bindcnt); + else if (strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - p->tcfa_refcnt--; - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (atomic_read(&p->tcfa_bindcnt) <= 0 && + refcount_dec_and_test(&p->tcfa_refcnt)) { if (p->ops->cleanup) p->ops->cleanup(p); tcf_idr_remove(p->idrinfo, p); @@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, if (index && p) { if (bind) - p->tcfa_bindcnt++; - p->tcfa_refcnt++; + atomic_inc(&p->tcfa_bindcnt); + refcount_inc(&p->tcfa_refcnt); *a = p; return true; } @@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, if (unlikely(!p)) return -ENOMEM; - p->tcfa_refcnt = 1; + refcount_set(&p->tcfa_refcnt, 1); if (bind) - p->tcfa_bindcnt = 1; + atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -782,7 +794,7 @@ static void cleanup_a(struct list_head *actions, int ovr) return; list_for_each_entry(a, actions, list) - a->tcfa_refcnt--; + refcount_dec(&a->tcfa_refcnt); } int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, @@ -810,7 +822,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, act->order = i; sz += tcf_action_fill_size(act); if (ovr) - act->tcfa_refcnt++; + refcount_inc(&act->tcfa_refcnt); list_add_tail(&act->list, actions); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 18089c02e557..15a2a53cbde1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, - .refcnt = prog->tcf_refcnt - ref, - .bindcnt = prog->tcf_bindcnt - bind, + .refcnt = refcount_read(&prog->tcf_refcnt) - ref, + .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, .action = prog->tcf_action, }; struct tcf_t tm; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..188865034f9a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -154,8 +154,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, struct tc_connmark opt = { .index = ci->tcf_index, - .refcnt = ci->tcf_refcnt - ref, - .bindcnt = ci->tcf_bindcnt - bind, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, .action = ci->tcf_action, .zone = ci->zone, }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..da865f7b390a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -597,8 +597,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4dc4f153cad8..ca83debd5a70 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -169,8 +169,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, - .refcnt = gact->tcf_refcnt - ref, - .bindcnt = gact->tcf_bindcnt - bind, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, .action = gact->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 20d7d36b2fc9..3536a23f46b5 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -596,8 +596,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, - .refcnt = ife->tcf_refcnt - ref, - .bindcnt = ife->tcf_bindcnt - bind, + .refcnt = refcount_read(&ife->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, .action = ife->tcf_action, .flags = p->flags, }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 14c312d7908f..7bce88dc11c9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -280,8 +280,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (unlikely(!t)) goto nla_put_failure; - c.bindcnt = ipt->tcf_bindcnt - bind; - c.refcnt = ipt->tcf_refcnt - ref; + c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; + c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..82a8bdd67c47 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -250,8 +250,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, - .refcnt = m->tcf_refcnt - ref, - .bindcnt = m->tcf_bindcnt - bind, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, .eaction = m->tcfm_eaction, .ifindex = dev ? dev->ifindex : 0, }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..457c2ae3de46 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -257,8 +257,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .index = p->tcf_index, .action = p->tcf_action, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e43aef28fdac..889690e0ec39 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,8 +409,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; opt->action = p->tcf_action; - opt->refcnt = p->tcf_refcnt - ref; - opt->bindcnt = p->tcf_bindcnt - bind; + opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; + opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 4e72bc2a0dfb..a789b8060968 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,8 +274,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, .action = police->tcf_action, .mtu = police->tcfp_mtu, .burst = PSCHED_NS2TICKS(police->tcfp_burst), - .refcnt = police->tcf_refcnt - ref, - .bindcnt = police->tcf_bindcnt - bind, + .refcnt = refcount_read(&police->tcf_refcnt) - ref, + .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5db358497c9e..4a46978db092 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -173,8 +173,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, - .refcnt = s->tcf_refcnt - ref, - .bindcnt = s->tcf_bindcnt - bind, + .refcnt = refcount_read(&s->tcf_refcnt) - ref, + .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 98c4afe7c15b..c3a761097b01 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -145,8 +145,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index dfaf5d8028dd..cfd20d3d2ca9 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -208,8 +208,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ad050d7d4b46..ff90d720eda3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -205,8 +205,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); struct tc_skbmod opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ea203e386a92..2354f07eba15 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -474,8 +474,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, - .refcnt = t->tcf_refcnt - ref, - .bindcnt = t->tcf_bindcnt - bind, + .refcnt = refcount_read(&t->tcf_refcnt) - ref, + .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, }; struct tcf_t tm; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1fb39e1f9d07..799e3deb44ac 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -239,8 +239,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, - .refcnt = v->tcf_refcnt - ref, - .bindcnt = v->tcf_bindcnt - bind, + .refcnt = refcount_read(&v->tcf_refcnt) - ref, + .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, .action = v->tcf_action, .v_action = p->tcfv_action, }; -- cgit v1.2.3 From 789871bb2a0381425b106d2a995bde1460d35a34 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:25 +0300 Subject: net: sched: implement unlocked action init API Add additional 'rtnl_held' argument to act API init functions. It is required to implement actions that need to release rtnl lock before loading kernel module and reacquire if afterwards. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 6 ++++-- net/sched/act_api.c | 18 +++++++++++------- net/sched/act_bpf.c | 3 ++- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 5 +++-- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 3 ++- net/sched/act_police.c | 2 +- net/sched/act_sample.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 3 ++- net/sched/act_vlan.c | 3 ++- net/sched/cls_api.c | 5 +++-- 19 files changed, 50 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2759226527a2..27823f4e24c4 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -92,7 +92,8 @@ struct tc_action_ops { struct netlink_ext_ack *extack); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, - int bind, struct netlink_ext_ack *extack); + int bind, bool rtnl_held, + struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, @@ -168,10 +169,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack); + bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4f064ecab882..256b0c93916c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; @@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("act_%s", act_name); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); a_o = tc_lookup_action_n(act_name); @@ -746,9 +749,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - extack); + rtnl_held, extack); else - err = a_o->init(net, nla, est, &a, ovr, bind, extack); + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, + extack); if (err < 0) goto err_mod; @@ -800,7 +804,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -814,7 +818,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1173,7 +1177,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, LIST_HEAD(actions); ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - &attr_size, extack); + &attr_size, true, extack); if (ret) return ret; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 15a2a53cbde1..8ebf40a3506c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, struct netlink_ext_ack *extack) + int replace, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 188865034f9a..e3787aa0025a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index da865f7b390a..334261943f9f 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ca83debd5a70..b4dfb2b4addc 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3536a23f46b5..576ffbba61c3 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 7bce88dc11c9..9c21663a86a6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -196,7 +196,8 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -204,7 +205,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool unlocked, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 82a8bdd67c47..5434f08f2eb7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -68,8 +68,9 @@ static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 457c2ae3de46..e6487ad1e4a8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 889690e0ec39..f7965f35585b 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,7 +132,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a789b8060968..0e1c2fb0ebea 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { int ret = 0, err; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 4a46978db092..316fc645595d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index c3a761097b01..dc591cc87f4a 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfd20d3d2ca9..c4ae4bd830aa 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -94,7 +94,8 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ff90d720eda3..026d6f58eda1 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2354f07eba15..15ea5ce0f9ed 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,7 +201,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 799e3deb44ac..c61775250722 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index bbf8dda96b0e..ebc2b9dd783f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1632,7 +1632,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND, true, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1645,7 +1645,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, extack); + &actions, &attr_size, true, + extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3 From 2a2ea349704fffade9526d5122299edbbfd122ca Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:27 +0300 Subject: net: sched: implement action API that deletes action by index Implement new action API function that atomically finds and deletes action from idr by index. Intended to be used by lockless actions that do not rely on rtnl lock. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_api.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 27823f4e24c4..a8eaae67c264 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -153,6 +153,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); static inline int tcf_idr_release(struct tc_action *a, bool bind) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aa304d36fee0..0f31f09946ab 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -319,6 +319,45 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } EXPORT_SYMBOL(tcf_idr_check); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret = 0; + + spin_lock(&idrinfo->lock); + p = idr_find(&idrinfo->action_idr, index); + if (!p) { + spin_unlock(&idrinfo->lock); + return -ENOENT; + } + + if (!atomic_read(&p->tcfa_bindcnt)) { + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + struct module *owner = p->ops->owner; + + WARN_ON(p != idr_remove(&idrinfo->action_idr, + p->tcfa_index)); + spin_unlock(&idrinfo->lock); + + if (p->ops->cleanup) + p->ops->cleanup(p); + + gen_kill_estimator(&p->tcfa_rate_est); + free_tcf(p); + module_put(owner); + return 0; + } + ret = 0; + } else { + ret = -EPERM; + } + + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_delete_index); + int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) -- cgit v1.2.3 From b409074e6693bcdaa7abbee2a035f22a9eabda53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:28 +0300 Subject: net: sched: add 'delete' function to action ops Extend action ops with 'delete' function. Each action type to implements its own delete function that doesn't depend on rtnl lock. Implement delete function that is required to delete actions without holding rtnl lock. Use action API function that atomically deletes action only if it is still in action idr. This implementation prevents concurrent threads from deleting same action twice. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_bpf.c | 8 ++++++++ net/sched/act_connmark.c | 8 ++++++++ net/sched/act_csum.c | 8 ++++++++ net/sched/act_gact.c | 8 ++++++++ net/sched/act_ife.c | 8 ++++++++ net/sched/act_ipt.c | 16 ++++++++++++++++ net/sched/act_mirred.c | 8 ++++++++ net/sched/act_nat.c | 8 ++++++++ net/sched/act_pedit.c | 8 ++++++++ net/sched/act_police.c | 8 ++++++++ net/sched/act_sample.c | 8 ++++++++ net/sched/act_simple.c | 8 ++++++++ net/sched/act_skbedit.c | 8 ++++++++ net/sched/act_skbmod.c | 8 ++++++++ net/sched/act_tunnel_key.c | 8 ++++++++ net/sched/act_vlan.c | 8 ++++++++ 17 files changed, 137 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index a8eaae67c264..b9ed2b8256a5 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + int (*delete)(struct net *net, u32 index); }; struct tc_action_net { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 8ebf40a3506c..7941dd66ff83 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -388,6 +388,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_bpf_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -398,6 +405,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .delete = tcf_bpf_delete, .size = sizeof(struct tcf_bpf), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e3787aa0025a..143c2d3de723 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -193,6 +193,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_connmark_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -202,6 +209,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .delete = tcf_connmark_delete, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 334261943f9f..3768539340e0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -654,6 +654,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_csum)); } +static int tcf_csum_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -665,6 +672,7 @@ static struct tc_action_ops act_csum_ops = { .walk = tcf_csum_walker, .lookup = tcf_csum_search, .get_fill_size = tcf_csum_get_fill_size, + .delete = tcf_csum_delete, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b4dfb2b4addc..a431a711f0dd 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -231,6 +231,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) return sz; } +static int tcf_gact_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -242,6 +249,7 @@ static struct tc_action_ops act_gact_ops = { .walk = tcf_gact_walker, .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, + .delete = tcf_gact_delete, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 576ffbba61c3..89a761395c94 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -844,6 +844,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ife_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ife_ops = { .kind = "ife", .type = TCA_ACT_IFE, @@ -854,6 +861,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .delete = tcf_ife_delete, .size = sizeof(struct tcf_ife_info), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 9c21663a86a6..6c234411c771 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -324,6 +324,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ipt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -334,6 +341,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .delete = tcf_ipt_delete, .size = sizeof(struct tcf_ipt), }; @@ -374,6 +382,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_xt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -384,6 +399,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .delete = tcf_xt_delete, .size = sizeof(struct tcf_ipt), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5434f08f2eb7..3d8300bce7e4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -322,6 +322,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) return rtnl_dereference(m->tcfm_dev); } +static int tcf_mirred_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -335,6 +342,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .delete = tcf_mirred_delete, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index e6487ad1e4a8..9eb27c89dc46 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -294,6 +294,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_nat_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -303,6 +310,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .delete = tcf_nat_delete, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f7965f35585b..45871052840f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -454,6 +454,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_pedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -464,6 +471,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .delete = tcf_pedit_delete, .size = sizeof(struct tcf_pedit), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0e1c2fb0ebea..c955fb0d4f3f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -314,6 +314,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_police_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_idr_delete_index(tn, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -327,6 +334,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .delete = tcf_police_delete, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 316fc645595d..6f79d2afcba2 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -220,6 +220,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_sample_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, @@ -230,6 +237,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .delete = tcf_sample_delete, .size = sizeof(struct tcf_sample), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index dc591cc87f4a..446c750f3d3c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -184,6 +184,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_simp_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -194,6 +201,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .delete = tcf_simp_delete, .size = sizeof(struct tcf_defact), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index c4ae4bd830aa..b3eaa120c7f4 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -267,6 +267,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -276,6 +283,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .delete = tcf_skbedit_delete, .size = sizeof(struct tcf_skbedit), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 026d6f58eda1..30be3f767495 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -253,6 +253,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbmod_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, @@ -263,6 +270,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .walk = tcf_skbmod_walker, .lookup = tcf_skbmod_search, + .delete = tcf_skbmod_delete, .size = sizeof(struct tcf_skbmod), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 15ea5ce0f9ed..655ed0b3fc67 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -534,6 +534,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tunnel_key_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .type = TCA_ACT_TUNNEL_KEY, @@ -544,6 +551,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .cleanup = tunnel_key_release, .walk = tunnel_key_walker, .lookup = tunnel_key_search, + .delete = tunnel_key_delete, .size = sizeof(struct tcf_tunnel_key), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c61775250722..e334d2751784 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -287,6 +287,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_vlan_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -297,6 +304,7 @@ static struct tc_action_ops act_vlan_ops = { .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .delete = tcf_vlan_delete, .size = sizeof(struct tcf_vlan), }; -- cgit v1.2.3 From 0190c1d452a91c38a3462abdd81752be1b9006a8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:32 +0300 Subject: net: sched: atomically check-allocate action Implement function that atomically checks if action exists and either takes reference to it, or allocates idr slot for action index to prevent concurrent allocations of actions with same index. Use EBUSY error pointer to indicate that idr slot is reserved. Implement cleanup helper function that removes temporary error pointer from idr. (in case of error between idr allocation and insertion of newly created action to specified index) Refactor all action init functions to insert new action to idr using this API. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++ net/sched/act_api.c | 92 ++++++++++++++++++++++++++++++++++++---------- net/sched/act_bpf.c | 11 ++++-- net/sched/act_connmark.c | 10 +++-- net/sched/act_csum.c | 11 ++++-- net/sched/act_gact.c | 11 ++++-- net/sched/act_ife.c | 6 ++- net/sched/act_ipt.c | 13 ++++++- net/sched/act_mirred.c | 16 ++++++-- net/sched/act_nat.c | 11 ++++-- net/sched/act_pedit.c | 12 ++++-- net/sched/act_police.c | 9 ++++- net/sched/act_sample.c | 11 ++++-- net/sched/act_simple.c | 11 +++++- net/sched/act_skbedit.c | 11 +++++- net/sched/act_skbmod.c | 11 +++++- net/sched/act_tunnel_key.c | 9 ++++- net/sched/act_vlan.c | 17 ++++++++- 18 files changed, 216 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index b9ed2b8256a5..8090de2edab7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -154,6 +154,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind); int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eefe8c2fe667..9511502e1cbb 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -303,7 +303,9 @@ static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); - if (p) { + if (IS_ERR(p)) { + p = NULL; + } else if (p) { refcount_inc(&p->tcfa_refcnt); if (bind) atomic_inc(&p->tcfa_bindcnt); @@ -371,7 +373,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; if (unlikely(!p)) @@ -389,20 +390,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, goto err2; } spin_lock_init(&p->tcfa_lock); - idr_preload(GFP_KERNEL); - spin_lock(&idrinfo->lock); - /* user doesn't specify an index */ - if (!index) { - index = 1; - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC); - } else { - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); - } - spin_unlock(&idrinfo->lock); - idr_preload_end(); - if (err) - goto err3; - p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; @@ -412,7 +399,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err4; + goto err3; } p->idrinfo = idrinfo; @@ -420,8 +407,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, INIT_LIST_HEAD(&p->list); *a = p; return 0; -err4: - idr_remove(idr, index); err3: free_percpu(p->cpu_qstats); err2: @@ -437,11 +422,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); - idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); +/* Cleanup idr index that was allocated but not initialized. */ + +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + + spin_lock(&idrinfo->lock); + /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); + spin_unlock(&idrinfo->lock); +} +EXPORT_SYMBOL(tcf_idr_cleanup); + +/* Check if action with specified index exists. If actions is found, increments + * its reference and bind counters, and return 1. Otherwise insert temporary + * error pointer (to prevent concurrent users from inserting actions with same + * index) and return 0. + */ + +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret; + +again: + spin_lock(&idrinfo->lock); + if (*index) { + p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { + /* This means that another process allocated + * index but did not assign the pointer yet. + */ + spin_unlock(&idrinfo->lock); + goto again; + } + + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + ret = 1; + } else { + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + *index, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, + ERR_PTR(-EBUSY), *index); + } + } else { + *index = 1; + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + UINT_MAX, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), + *index); + } + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_check_alloc); + void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index d3f4ac6f2c4b..06f743d8ed41 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -299,14 +299,17 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, act, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, act, &act_bpf_ops, bind, true); - if (ret < 0) + if (ret < 0) { + tcf_idr_cleanup(tn, parm->index); return ret; + } res = ACT_P_CREATED; - } else { + } else if (ret > 0) { /* Don't override defaults. */ if (bind) return 0; @@ -315,6 +318,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, tcf_idr_release(*act, bind); return -EEXIST; } + } else { + return ret; } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 701e90244eff..1e31f0e448e2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, a, &act_connmark_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ci = to_connmark(*a); ci->tcf_action = parm->action; @@ -131,7 +134,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; - } else { + } else if (ret > 0) { ci = to_connmark(*a); if (bind) return 0; @@ -142,6 +145,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; + ret = 0; } return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5dbee136b0a1..bd232d3bd022 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -67,19 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_csum_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } p = to_tcf_csum(*a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 11c4de3f344e..661b72b9147d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -91,19 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_gact_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } gact = to_gact(*a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index acea3feae762..a3eef00cd711 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -484,7 +484,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!p) return -ENOMEM; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) { kfree(p); return 0; @@ -494,6 +497,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); if (ret) { + tcf_idr_cleanup(tn, parm->index); kfree(p); return ret; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 85e85dfba401..0dc787a57798 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_idr_check(tn, index, a, bind); + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } @@ -133,14 +138,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, index); return ret; + } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e08aed06d7f8..6afd89a36c69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -79,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_mirred *m; struct net_device *dev; bool exists = false; - int ret; + int ret, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); @@ -94,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -107,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } @@ -115,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -124,13 +131,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (!dev) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1f91e8e66c0f..4dd9188a72fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -57,19 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_nat_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } p = to_tcf_nat(*a); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3a0e2f762f4e..cc8ffcd1ddb5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -173,16 +173,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { if (!parm->nkeys) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_free; } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); goto out_free; + } p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { @@ -191,7 +195,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) goto out_free; if (!ovr) { @@ -207,6 +211,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } } + } else { + return err; } spin_lock_bh(&p->tcf_lock); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 99335cca739e..1f3192ea8df7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -101,15 +101,20 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, &act_police_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index a8582e1347db..3079e7be5bde 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -46,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_sample *parm; struct tcf_sample *s; bool exists = false; - int ret; + int ret, err; if (!nla) return -EINVAL; @@ -59,15 +59,20 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 2da47c682a30..aa51152e0066 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -100,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_defact(*a); ret = alloc_defdata(d, tb[TCA_DEF_DATA]); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 4616a2c1821f..86521a74ecdd 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -152,21 +152,28 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_skbedit(*a); ret = ACT_P_CREATED; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e844381af066..cdc6bacfb190 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -128,21 +128,28 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!lflags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbmod_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ab5bf5c13f87..3ec585d58762 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -237,7 +237,10 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -325,7 +328,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &act_tunnel_key_ops, bind, true); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); - return ret; + goto err_out; } ret = ACT_P_CREATED; @@ -364,6 +367,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, err_out: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return ret; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 9b600faaccbb..ad37f308175a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -134,7 +134,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -146,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ERANGE; } @@ -164,6 +171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EPROTONOSUPPORT; } } else { @@ -176,6 +185,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } action = parm->v_action; @@ -183,8 +194,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_vlan_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { -- cgit v1.2.3 From 90b73b77d08ec395311411b545c756ca710aae59 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:33 +0300 Subject: net: sched: change action API to use array of pointers to actions Act API used linked list to pass set of actions to functions. It is intrusive data structure that stores list nodes inside action structure itself, which means it is not safe to modify such list concurrently. However, action API doesn't use any linked list specific operations on this set of actions, so it can be safely refactored into plain pointer array. Refactor action API to use array of pointers to tc_actions instead of linked list. Change argument 'actions' type of exported action init, destroy and dump functions. Acked-by: Jiri Pirko Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 7 ++-- net/sched/act_api.c | 89 +++++++++++++++++++++++++++++---------------------- net/sched/cls_api.c | 21 ++++-------- 3 files changed, 60 insertions(+), 57 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8090de2edab7..683ce41053d9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -168,19 +168,20 @@ static inline int tcf_idr_release(struct tc_action *a, bool bind) int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); -int tcf_action_destroy(struct list_head *actions, int bind); +int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack); -int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, + int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9511502e1cbb..bf1c35f3deb6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -657,13 +657,15 @@ repeat: } EXPORT_SYMBOL(tcf_action_exec); -int tcf_action_destroy(struct list_head *actions, int bind) +int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; - struct tc_action *a, *tmp; - int ret = 0; + struct tc_action *a; + int ret = 0, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; + actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) @@ -679,11 +681,12 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } -static void tcf_action_put_lst(struct list_head *actions) +static void tcf_action_put_many(struct tc_action *actions[]) { - struct tc_action *a, *tmp; + int i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) @@ -735,14 +738,15 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; - int err = -EINVAL; + int err = -EINVAL, i; struct nlattr *nest; - list_for_each_entry(a, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -878,10 +882,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - LIST_HEAD(actions); + struct tc_action *actions[] = { a, NULL }; - list_add_tail(&a->list, &actions); - tcf_action_destroy(&actions, bind); + tcf_action_destroy(actions, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } @@ -899,9 +902,11 @@ err_out: return ERR_PTR(err); } +/* Returns numbers of initialized actions or negative error. */ + int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -923,11 +928,12 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } act->order = i; sz += tcf_action_fill_size(act); - list_add_tail(&act->list, actions); + /* Start from index 0 */ + actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); - return 0; + return i - 1; err: tcf_action_destroy(actions, bind); @@ -978,7 +984,7 @@ errout: return -1; } -static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, +static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { @@ -1014,7 +1020,7 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event, + struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1150,14 +1156,14 @@ err_out: return err; } -static int tcf_action_delete(struct net *net, struct list_head *actions, - struct netlink_ext_ack *extack) +static int tcf_action_delete(struct net *net, struct tc_action *actions[], + int *acts_deleted, struct netlink_ext_ack *extack) { - struct tc_action *a, *tmp; u32 act_index; - int ret; + int ret, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their @@ -1165,23 +1171,26 @@ static int tcf_action_delete(struct net *net, struct list_head *actions, */ act_index = a->tcfa_index; - list_del(&a->list); if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { /* now do the delete */ ret = ops->delete(net, act_index); - if (ret < 0) + if (ret < 0) { + *acts_deleted = i + 1; return ret; + } } } + *acts_deleted = i; return 0; } static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], + int *acts_deleted, u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -1199,7 +1208,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } /* now do the delete */ - ret = tcf_action_delete(net, actions, extack); + ret = tcf_action_delete(net, actions, acts_deleted, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1221,7 +1230,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {}; + int acts_deleted = 0; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) @@ -1243,26 +1253,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } act->order = i; attr_size += tcf_action_fill_size(act); - list_add_tail(&act->list, &actions); + actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event, extack); + ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_del_notify(net, n, actions, &acts_deleted, portid, + attr_size, extack); if (ret) goto err; return ret; } err: - tcf_action_put_lst(&actions); + tcf_action_put_many(&actions[acts_deleted]); return ret; } static int -tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1293,15 +1304,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, { size_t attr_size = 0; int ret = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, &attr_size, true, extack); - if (ret) + if (ret < 0) return ret; - ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); if (ovr) - tcf_action_put_lst(&actions); + tcf_action_put_many(actions); return ret; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9041f0e43e9a..73d9967c3739 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1609,10 +1609,7 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(actions); - - tcf_exts_to_list(exts, &actions); - tcf_action_destroy(&actions, TCA_ACT_UNBIND); + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); exts->nr_actions = 0; #endif @@ -1639,18 +1636,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[0] = act; exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - LIST_HEAD(actions); - int err, i = 0; + int err; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, true, + exts->actions, &attr_size, true, extack); - if (err) + if (err < 0) return err; - list_for_each_entry(act, &actions, list) - exts->actions[i++] = act; - exts->nr_actions = i; + exts->nr_actions = err; } exts->net = net; } @@ -1699,14 +1693,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { - LIST_HEAD(actions); - nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - tcf_exts_to_list(exts, &actions); - if (tcf_action_dump(skb, &actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { -- cgit v1.2.3 From 03dc7a35fcc83a199121a5156c4a7a976b836682 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:19:14 +0200 Subject: ipv6: xfrm: use 64-bit timestamps get_seconds() is deprecated because it can overflow on 32-bit architectures. For the xfrm_state->lastused member, we treat the data as a 64-bit number already, so we just need to use the right accessor that works on both 32-bit and 64-bit machines. Signed-off-by: Arnd Bergmann Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/ipv6/xfrm6_mode_ro.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a5378613a49c..1350e2cf0749 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -227,7 +227,7 @@ struct xfrm_state { long saved_tmo; /* Last used time */ - unsigned long lastused; + time64_t lastused; struct page_frag xfrag; diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c index 07d36573f50b..da28e4407b8f 100644 --- a/net/ipv6/xfrm6_mode_ro.c +++ b/net/ipv6/xfrm6_mode_ro.c @@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) __skb_pull(skb, hdr_len); memmove(ipv6_hdr(skb), iph, hdr_len); - x->lastused = get_seconds(); + x->lastused = ktime_get_real_seconds(); return 0; } -- cgit v1.2.3 From 4929c9428a171145f82f81aae0c3c25ef7d82837 Mon Sep 17 00:00:00 2001 From: Deepti Raghavan Date: Mon, 9 Jul 2018 17:53:39 +0000 Subject: tcp: expose both send and receive intervals for rate sample Congestion control algorithms, which access the rate sample through the tcp_cong_control function, only have access to the maximum of the send and receive interval, for cases where the acknowledgment rate may be inaccurate due to ACK compression or decimation. Algorithms may want to use send rates and receive rates as separate signals. Signed-off-by: Deepti Raghavan Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_rate.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index cce37694776e..f6cb20e6e524 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -954,6 +954,8 @@ struct rate_sample { u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index c61240e43923..4dff40dad4dc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" -- cgit v1.2.3 From eeed992b776c54af6108187c87ac60d028e69d37 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 10 Jul 2018 10:02:58 +0300 Subject: net: Add lag.h, net_lag_port_dev_txable() LAG devices (team or bond) recognize for each one of their slave devices whether LAG traffic is going to be sent through that device. Bond calls such devices "active", team calls them "txable". When this state changes, a NETDEV_CHANGELOWERSTATE notification is distributed, together with a netdev_notifier_changelowerstate_info structure that for LAG devices includes a tx_enabled flag that refers to the new state. The notification thus makes it possible to react to the changes in txability in drivers. However there's no way to query txability from the outside on demand. That is problematic namely for mlxsw, which when resolving ERSPAN packet path, may encounter a LAG device, and needs to determine which of the slaves it should choose. To that end, introduce a new function, net_lag_port_dev_txable(), which determines whether a given slave device is "active" or "txable" (depending on the flavor of the LAG device). That function then dispatches to per-LAG-flavor helpers, bond_is_active_slave_dev() resp. team_port_dev_txable(). Because there currently is no good place where net_lag_port_dev_txable() should be added, introduce a new header file, lag.h, which should from now on hold any logic common to both team and bond. (But keep netif_is_lag_master() together with the rest of netif_is_*_master() functions). Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_team.h | 13 +++++++++++++ include/net/bonding.h | 13 +++++++++++++ include/net/lag.h | 17 +++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 include/net/lag.h (limited to 'include/net') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 0d07c6655cce..ac42da56f7a2 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -89,6 +89,19 @@ static inline bool team_port_txable(struct team_port *port) return port->linkup && team_port_enabled(port); } +static inline bool team_port_dev_txable(const struct net_device *port_dev) +{ + struct team_port *port; + bool txable; + + rcu_read_lock(); + port = team_port_get_rcu(port_dev); + txable = port ? team_port_txable(port) : false; + rcu_read_unlock(); + + return txable; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) diff --git a/include/net/bonding.h b/include/net/bonding.h index 808f1d167349..a2d058170ea3 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -411,6 +411,19 @@ static inline bool bond_slave_can_tx(struct slave *slave) bond_is_active_slave(slave); } +static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) +{ + struct slave *slave; + bool active; + + rcu_read_lock(); + slave = bond_slave_get_rcu(slave_dev); + active = bond_is_active_slave(slave); + rcu_read_unlock(); + + return active; +} + static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { diff --git a/include/net/lag.h b/include/net/lag.h new file mode 100644 index 000000000000..95b880e6fdde --- /dev/null +++ b/include/net/lag.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IF_LAG_H +#define _LINUX_IF_LAG_H + +#include +#include +#include + +static inline bool net_lag_port_dev_txable(const struct net_device *port_dev) +{ + if (netif_is_team_port(port_dev)) + return team_port_dev_txable(port_dev); + else + return bond_is_active_slave_dev(port_dev); +} + +#endif /* _LINUX_IF_LAG_H */ -- cgit v1.2.3 From cca9bab1b72cd2296097c75f59ef11ef80461279 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jul 2018 12:16:12 +0200 Subject: tcp: use monotonic timestamps for PAWS Using get_seconds() for timestamps is deprecated since it can lead to overflows on 32-bit systems. While the interface generally doesn't overflow until year 2106, the specific implementation of the TCP PAWS algorithm breaks in 2038 when the intermediate signed 32-bit timestamps overflow. A related problem is that the local timestamps in CLOCK_REALTIME form lead to unexpected behavior when settimeofday is called to set the system clock backwards or forwards by more than 24 days. While the first problem could be solved by using an overflow-safe method of comparing the timestamps, a nicer solution is to use a monotonic clocksource with ktime_get_seconds() that simply doesn't overflow (at least not until 136 years after boot) and that doesn't change during settimeofday(). To make 32-bit and 64-bit architectures behave the same way here, and also save a few bytes in the tcp_options_received structure, I'm changing the type to a 32-bit integer, which is now safe on all architectures. Finally, the ts_recent_stamp field also (confusingly) gets used to store a jiffies value in tcp_synq_overflow()/tcp_synq_no_recent_overflow(). This is currently safe, but changing the type to 32-bit requires some small changes there to keep it working. Signed-off-by: Arnd Bergmann Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +- include/linux/tcp.h | 4 ++-- include/net/tcp.h | 17 ++++++++++------- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 8 ++++---- 6 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index 2bb6f0380758..0997e166ea57 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -1673,7 +1673,7 @@ static void chtls_timewait(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->rcv_nxt++; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); tp->srtt_us = 0; tcp_time_wait(sk, TCP_TIME_WAIT, 0); } diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dbea6610304..58a8d7d71354 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -89,7 +89,7 @@ struct tcp_sack_block { struct tcp_options_received { /* PAWS/RTTM data */ - long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ + int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ @@ -426,7 +426,7 @@ struct tcp_timewait_sock { /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; - long tw_ts_recent_stamp; + int tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif diff --git a/include/net/tcp.h b/include/net/tcp.h index f6cb20e6e524..582304955087 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -472,19 +472,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; - unsigned long now = jiffies; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int now = jiffies; - if (time_after(now, last_overflow + HZ)) + if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int now = jiffies; - return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID); + return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) @@ -1375,7 +1376,8 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; - if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) + if (unlikely(!time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, @@ -1405,7 +1407,8 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, However, we can relax time bounds for RST segments to MSL. */ - if (rst && get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_MSL) + if (rst && !time_before32(ktime_get_seconds(), + rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 814ea43dd12f..d3b6390ecf23 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3462,7 +3462,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); } static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..dc415c66a33a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -155,7 +155,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { + (!twp || (reuse && time_after32(ktime_get_seconds(), + tcptw->tw_ts_recent_stamp)))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) tp->write_seq = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dac5893a52b4..75ef332a7caf 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -144,7 +144,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } @@ -189,7 +189,7 @@ kill: if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = get_seconds(); + tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); @@ -537,7 +537,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; - newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; @@ -603,7 +603,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<num_timeout); + tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } -- cgit v1.2.3 From c749cdda9089eb1fdb6a9ab98f945124d12f2595 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 11 Jul 2018 16:04:50 +0200 Subject: net/sched: act_skbedit: don't use spinlock in the data path use RCU instead of spin_{,un}lock_bh, to protect concurrent read/write on act_skbedit configuration. This reduces the effects of contention in the data path, in case multiple readers are present. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 37 ++++++++++---- net/sched/act_skbedit.c | 107 +++++++++++++++++++++++++--------------- 2 files changed, 95 insertions(+), 49 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 19cd3d345804..911bbac838a2 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -22,14 +22,19 @@ #include #include +struct tcf_skbedit_params { + u32 flags; + u32 priority; + u32 mark; + u32 mask; + u16 queue_mapping; + u16 ptype; + struct rcu_head rcu; +}; + struct tcf_skbedit { - struct tc_action common; - u32 flags; - u32 priority; - u32 mark; - u32 mask; - u16 queue_mapping; - u16 ptype; + struct tc_action common; + struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) @@ -37,15 +42,27 @@ struct tcf_skbedit { static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) - return to_skbedit(a)->flags == SKBEDIT_F_MARK; + u32 flags; + + if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) { + rcu_read_lock(); + flags = rcu_dereference(to_skbedit(a)->params)->flags; + rcu_read_unlock(); + return flags == SKBEDIT_F_MARK; + } #endif return false; } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { - return to_skbedit(a)->mark; + u32 mark; + + rcu_read_lock(); + mark = rcu_dereference(to_skbedit(a)->params)->mark; + rcu_read_unlock(); + + return mark; } #endif /* __NET_TC_SKBEDIT_H */ diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8651b5bd6b59..da56e6938c9e 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -37,14 +37,19 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + int action; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - spin_lock(&d->tcf_lock); - if (d->flags & SKBEDIT_F_PRIORITY) - skb->priority = d->priority; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + rcu_read_lock(); + params = rcu_dereference(d->params); + action = READ_ONCE(d->tcf_action); + + if (params->flags & SKBEDIT_F_PRIORITY) + skb->priority = params->priority; + if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (tc_skb_protocol(skb)) { @@ -63,23 +68,23 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, break; } } - if (d->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > d->queue_mapping) - skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) { - skb->mark &= ~d->mask; - skb->mark |= d->mark & d->mask; + if (params->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > params->queue_mapping) + skb_set_queue_mapping(skb, params->queue_mapping); + if (params->flags & SKBEDIT_F_MARK) { + skb->mark &= ~params->mask; + skb->mark |= params->mark & params->mask; } - if (d->flags & SKBEDIT_F_PTYPE) - skb->pkt_type = d->ptype; - - spin_unlock(&d->tcf_lock); - return d->tcf_action; + if (params->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = params->ptype; +unlock: + rcu_read_unlock(); + return action; err: - spin_unlock(&d->tcf_lock); qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + goto unlock; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -98,6 +103,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tcf_skbedit_params *params_old, *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -185,25 +191,34 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } } - spin_lock_bh(&d->tcf_lock); + ASSERT_RTNL(); - d->flags = flags; + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } + + params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) - d->priority = *priority; + params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) - d->queue_mapping = *queue_mapping; + params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) - d->mark = *mark; + params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) - d->ptype = *ptype; + params_new->ptype = *ptype; /* default behaviour is to use all the bits */ - d->mask = 0xffffffff; + params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) - d->mask = *mask; + params_new->mask = *mask; d->tcf_action = parm->action; - - spin_unlock_bh(&d->tcf_lock); + params_old = rtnl_dereference(d->params); + rcu_assign_pointer(d->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -215,33 +230,36 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; - struct tcf_t t; u64 pure_flags = 0; + struct tcf_t t; + + params = rtnl_dereference(d->params); if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) + if ((params->flags & SKBEDIT_F_PRIORITY) && + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) + if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MARK) && - nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) + if ((params->flags & SKBEDIT_F_MARK) && + nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + if ((params->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MASK) && - nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + if ((params->flags & SKBEDIT_F_MASK) && + nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) + if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) @@ -257,6 +275,16 @@ nla_put_failure: return -1; } +static void tcf_skbedit_cleanup(struct tc_action *a) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + + params = rcu_dereference_protected(d->params, 1); + if (params) + kfree_rcu(params, rcu); +} + static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, @@ -289,6 +317,7 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, .delete = tcf_skbedit_delete, -- cgit v1.2.3 From b16ebe925a4400a2ec3dc663c81dce2fd9bf0998 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:08 +0300 Subject: devlink: Add support for creating and destroying regions This allows a device to register its supported address regions. Each address region can be accessed directly for example reading the snapshots taken of this address space. Drivers are not limited in the name selection for different regions. An example of a region-name can be: pci cr-space, register-space. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 22 ++++++++++++++ net/core/devlink.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index f67c29cede15..e5397652f2fb 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -28,6 +28,7 @@ struct devlink { struct list_head dpipe_table_list; struct list_head resource_list; struct list_head param_list; + struct list_head region_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -397,6 +398,8 @@ enum devlink_param_generic_id { .validate = _validate, \ } +struct devlink_region; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -543,6 +546,11 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val); void devlink_param_value_changed(struct devlink *devlink, u32 param_id); +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size); +void devlink_region_destroy(struct devlink_region *region); #else @@ -770,6 +778,20 @@ devlink_param_value_changed(struct devlink *devlink, u32 param_id) { } +static inline struct devlink_region * +devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + return NULL; +} + +static inline void +devlink_region_destroy(struct devlink_region *region) +{ +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 470f3dbfecfe..cac856136ac6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -326,6 +326,28 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, pool_type, p_tc_index); } +struct devlink_region { + struct devlink *devlink; + struct list_head list; + const char *name; + struct list_head snapshot_list; + u32 max_snapshots; + u32 cur_snapshots; + u64 size; +}; + +static struct devlink_region * +devlink_region_get_by_name(struct devlink *devlink, const char *region_name) +{ + struct devlink_region *region; + + list_for_each_entry(region, &devlink->region_list, list) + if (!strcmp(region->name, region_name)) + return region; + + return NULL; +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -3358,6 +3380,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->param_list); + INIT_LIST_HEAD(&devlink->region_list); mutex_init(&devlink->lock); return devlink; } @@ -4109,6 +4132,67 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) } EXPORT_SYMBOL_GPL(devlink_param_value_changed); +/** + * devlink_region_create - create a new address region + * + * @devlink: devlink + * @region_name: region name + * @region_max_snapshots: Maximum supported number of snapshots for region + * @region_size: size of region + */ +struct devlink_region *devlink_region_create(struct devlink *devlink, + const char *region_name, + u32 region_max_snapshots, + u64 region_size) +{ + struct devlink_region *region; + int err = 0; + + mutex_lock(&devlink->lock); + + if (devlink_region_get_by_name(devlink, region_name)) { + err = -EEXIST; + goto unlock; + } + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + err = -ENOMEM; + goto unlock; + } + + region->devlink = devlink; + region->max_snapshots = region_max_snapshots; + region->name = region_name; + region->size = region_size; + INIT_LIST_HEAD(®ion->snapshot_list); + list_add_tail(®ion->list, &devlink->region_list); + + mutex_unlock(&devlink->lock); + return region; + +unlock: + mutex_unlock(&devlink->lock); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(devlink_region_create); + +/** + * devlink_region_destroy - destroy address region + * + * @region: devlink region to destroy + */ +void devlink_region_destroy(struct devlink_region *region) +{ + struct devlink *devlink = region->devlink; + + mutex_lock(&devlink->lock); + list_del(®ion->list); + mutex_unlock(&devlink->lock); + kfree(region); +} +EXPORT_SYMBOL_GPL(devlink_region_destroy); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From ccadfa444b34c6ec7bb458eee17fdd8c9a456c63 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:09 +0300 Subject: devlink: Add callback to query for snapshot id before snapshot create To restrict the driver with the snapshot ID selection a new callback is introduced for the driver to get the snapshot ID before creating a new snapshot. This will also allow giving the same ID for multiple snapshots taken of different regions on the same time. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++++ net/core/devlink.c | 21 +++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index e5397652f2fb..f27d8593687a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -29,6 +29,7 @@ struct devlink { struct list_head resource_list; struct list_head param_list; struct list_head region_list; + u32 snapshot_id; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -551,6 +552,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, u32 region_max_snapshots, u64 region_size); void devlink_region_destroy(struct devlink_region *region); +u32 devlink_region_shapshot_id_get(struct devlink *devlink); #else @@ -792,6 +794,12 @@ devlink_region_destroy(struct devlink_region *region) { } +static inline u32 +devlink_region_shapshot_id_get(struct devlink *devlink) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index cac856136ac6..6c92ddd2465d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4193,6 +4193,27 @@ void devlink_region_destroy(struct devlink_region *region) } EXPORT_SYMBOL_GPL(devlink_region_destroy); +/** + * devlink_region_shapshot_id_get - get snapshot ID + * + * This callback should be called when adding a new snapshot, + * Driver should use the same id for multiple snapshots taken + * on multiple regions at the same time/by the same trigger. + * + * @devlink: devlink + */ +u32 devlink_region_shapshot_id_get(struct devlink *devlink) +{ + u32 id; + + mutex_lock(&devlink->lock); + id = ++devlink->snapshot_id; + mutex_unlock(&devlink->lock); + + return id; +} +EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From d7e5272282d93bedbbeb6174b8af8425d7dcfd6f Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:10 +0300 Subject: devlink: Add support for creating region snapshots Each device address region can store multiple snapshots, each snapshot is identified using a different numerical ID. This ID is used when deleting a snapshot or showing an address region specific snapshot. This patch exposes a callback to add a new snapshot to an address region. The snapshot will be deleted using the destructor function when destroying a region or when a snapshot delete command from devlink user tool. Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 13 +++++++ net/core/devlink.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index f27d8593687a..905f0bb7b4ba 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -401,6 +401,8 @@ enum devlink_param_generic_id { struct devlink_region; +typedef void devlink_snapshot_data_dest_t(const void *data); + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -553,6 +555,9 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, u64 region_size); void devlink_region_destroy(struct devlink_region *region); u32 devlink_region_shapshot_id_get(struct devlink *devlink); +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor); #else @@ -800,6 +805,14 @@ devlink_region_shapshot_id_get(struct devlink *devlink) return 0; } +static inline int +devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 6c92ddd2465d..7d09fe60fa4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -336,6 +336,15 @@ struct devlink_region { u64 size; }; +struct devlink_snapshot { + struct list_head list; + struct devlink_region *region; + devlink_snapshot_data_dest_t *data_destructor; + u64 data_len; + u8 *data; + u32 id; +}; + static struct devlink_region * devlink_region_get_by_name(struct devlink *devlink, const char *region_name) { @@ -348,6 +357,26 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name) return NULL; } +static struct devlink_snapshot * +devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) +{ + struct devlink_snapshot *snapshot; + + list_for_each_entry(snapshot, ®ion->snapshot_list, list) + if (snapshot->id == id) + return snapshot; + + return NULL; +} + +static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot) +{ + snapshot->region->cur_snapshots--; + list_del(&snapshot->list); + (*snapshot->data_destructor)(snapshot->data); + kfree(snapshot); +} + #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) @@ -4185,8 +4214,14 @@ EXPORT_SYMBOL_GPL(devlink_region_create); void devlink_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot, *ts; mutex_lock(&devlink->lock); + + /* Free all snapshots of region */ + list_for_each_entry_safe(snapshot, ts, ®ion->snapshot_list, list) + devlink_region_snapshot_del(snapshot); + list_del(®ion->list); mutex_unlock(&devlink->lock); kfree(region); @@ -4214,6 +4249,66 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get); +/** + * devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * @devlink_region: devlink region of the snapshot + * @data_len: size of snapshot data + * @data: snapshot data + * @snapshot_id: snapshot id to be created + * @data_destructor: pointer to destructor function to free data + */ +int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, + u8 *data, u32 snapshot_id, + devlink_snapshot_data_dest_t *data_destructor) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + mutex_lock(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) { + err = -ENOMEM; + goto unlock; + } + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + err = -EEXIST; + goto unlock; + } + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) { + err = -ENOMEM; + goto unlock; + } + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + snapshot->data_len = data_len; + snapshot->data_destructor = data_destructor; + + list_add_tail(&snapshot->list, ®ion->snapshot_list); + + region->cur_snapshots++; + + mutex_unlock(&devlink->lock); + return 0; + +unlock: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.2.3 From f6a69885f2e38be0229ab9f6a2d9d4a1b4ba2be5 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 12 Jul 2018 15:13:17 +0300 Subject: devlink: Add generic parameters region_snapshot region_snapshot - When set enables capturing region snapshots Signed-off-by: Alex Vesker Signed-off-by: Jiri Pirko Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 905f0bb7b4ba..b9b89d6604d4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -361,6 +361,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, + DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -376,6 +377,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME "enable_sriov" #define DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME "region_snapshot_enable" +#define DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index e5118dba6bb4..65fc366a78a4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2671,6 +2671,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, + .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, + .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 811e299f4645588cc7a1b78d97b6847c155324b9 Mon Sep 17 00:00:00 2001 From: Romuald CARI Date: Thu, 7 Jun 2018 16:08:02 +0200 Subject: ieee802154: add rx LQI from userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Link Quality Indication data exposed by drivers could not be accessed from userspace. Since this data is per-datagram received, it makes sense to make it available to userspace application through the ancillary data mechanism in recvmsg rather than through ioctls. This can be activated using the socket option WPAN_WANTLQI under SOL_IEEE802154 protocol. This LQI data is available in the ancillary data buffer under the SOL_IEEE802154 level as the type WPAN_LQI. The value is an unsigned byte indicating the link quality with values ranging 0-255. Signed-off-by: Romuald Cari Signed-off-by: Clément Peron Signed-off-by: Stefan Schmidt --- include/net/af_ieee802154.h | 1 + net/ieee802154/socket.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/af_ieee802154.h b/include/net/af_ieee802154.h index a5563d27a3eb..8003a9f6eb43 100644 --- a/include/net/af_ieee802154.h +++ b/include/net/af_ieee802154.h @@ -56,6 +56,7 @@ struct sockaddr_ieee802154 { #define WPAN_WANTACK 0 #define WPAN_SECURITY 1 #define WPAN_SECURITY_LEVEL 2 +#define WPAN_WANTLQI 3 #define WPAN_SECURITY_DEFAULT 0 #define WPAN_SECURITY_OFF 1 diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a60658c85a9a..bc6b912603f1 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -25,6 +25,7 @@ #include /* For TIOCOUTQ/INQ */ #include #include +#include #include #include #include @@ -452,6 +453,7 @@ struct dgram_sock { unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; + unsigned int want_lqi:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; @@ -486,6 +488,7 @@ static int dgram_init(struct sock *sk) struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; + ro->want_lqi = 0; return 0; } @@ -713,6 +716,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; + struct dgram_sock *ro = dgram_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, noblock, &err); @@ -744,6 +748,13 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, *addr_len = sizeof(*saddr); } + if (ro->want_lqi) { + err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI, + sizeof(uint8_t), &(mac_cb(skb)->lqi)); + if (err) + goto done; + } + if (flags & MSG_TRUNC) copied = skb->len; done: @@ -847,6 +858,9 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: val = ro->want_ack; break; + case WPAN_WANTLQI: + val = ro->want_lqi; + break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; @@ -892,6 +906,9 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname, case WPAN_WANTACK: ro->want_ack = !!val; break; + case WPAN_WANTLQI: + ro->want_lqi = !!val; + break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { -- cgit v1.2.3 From 05296620f6d14dce0030b87e1e57891a770fb65c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Jul 2018 20:36:40 -0700 Subject: xdp: factor out common program/flags handling from drivers Basic operations drivers perform during xdp setup and query can be moved to helpers in the core. Encapsulate program and flags into a structure and add helpers. Note that the structure is intended as the "main" program information source in the driver. Most drivers will additionally place the program pointer in their fast path or ring structures. The helpers don't have a huge impact now, but they will decrease the code duplication when programs can be installed in HW and driver at the same time. Encapsulating the basic operations in helpers will hopefully also reduce the number of changes to drivers which adopt them. Helpers could really be static inline, but they depend on definition of struct netdev_bpf which means they'd have to be placed in netdevice.h, an already 4500 line header. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/nfp_net.h | 6 ++-- .../net/ethernet/netronome/nfp/nfp_net_common.c | 28 +++++++----------- drivers/net/netdevsim/bpf.c | 16 +++------- drivers/net/netdevsim/netdevsim.h | 4 +-- include/net/xdp.h | 13 +++++++++ net/core/xdp.c | 34 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_offload.py | 4 +-- 7 files changed, 67 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h index 2a71a9ffd095..2021dda595b7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h @@ -553,8 +553,7 @@ struct nfp_net_dp { * @rss_cfg: RSS configuration * @rss_key: RSS secret key * @rss_itbl: RSS indirection table - * @xdp_flags: Flags with which XDP prog was loaded - * @xdp_prog: XDP prog (for ctrl path, both DRV and HW modes) + * @xdp: Information about the attached XDP program * @max_r_vecs: Number of allocated interrupt vectors for RX/TX * @max_tx_rings: Maximum number of TX rings supported by the Firmware * @max_rx_rings: Maximum number of RX rings supported by the Firmware @@ -610,8 +609,7 @@ struct nfp_net { u8 rss_key[NFP_NET_CFG_RSS_KEY_SZ]; u8 rss_itbl[NFP_NET_CFG_RSS_ITBL_SZ]; - u32 xdp_flags; - struct bpf_prog *xdp_prog; + struct xdp_attachment_info xdp; unsigned int max_tx_rings; unsigned int max_rx_rings; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index d20714598613..4bb589dbffbc 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3417,34 +3417,29 @@ nfp_net_xdp_setup_drv(struct nfp_net *nn, struct bpf_prog *prog, return nfp_net_ring_reconfig(nn, dp, extack); } -static int -nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog, u32 flags, - struct netlink_ext_ack *extack) +static int nfp_net_xdp_setup(struct nfp_net *nn, struct netdev_bpf *bpf) { struct bpf_prog *drv_prog, *offload_prog; int err; - if (nn->xdp_prog && (flags ^ nn->xdp_flags) & XDP_FLAGS_MODES) + if (!xdp_attachment_flags_ok(&nn->xdp, bpf)) return -EBUSY; /* Load both when no flags set to allow easy activation of driver path * when program is replaced by one which can't be offloaded. */ - drv_prog = flags & XDP_FLAGS_HW_MODE ? NULL : prog; - offload_prog = flags & XDP_FLAGS_DRV_MODE ? NULL : prog; + drv_prog = bpf->flags & XDP_FLAGS_HW_MODE ? NULL : bpf->prog; + offload_prog = bpf->flags & XDP_FLAGS_DRV_MODE ? NULL : bpf->prog; - err = nfp_net_xdp_setup_drv(nn, drv_prog, extack); + err = nfp_net_xdp_setup_drv(nn, drv_prog, bpf->extack); if (err) return err; - err = nfp_app_xdp_offload(nn->app, nn, offload_prog, extack); - if (err && flags & XDP_FLAGS_HW_MODE) + err = nfp_app_xdp_offload(nn->app, nn, offload_prog, bpf->extack); + if (err && bpf->flags & XDP_FLAGS_HW_MODE) return err; - if (nn->xdp_prog) - bpf_prog_put(nn->xdp_prog); - nn->xdp_prog = prog; - nn->xdp_flags = flags; + xdp_attachment_setup(&nn->xdp, bpf); return 0; } @@ -3456,12 +3451,9 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: case XDP_SETUP_PROG_HW: - return nfp_net_xdp_setup(nn, xdp->prog, xdp->flags, - xdp->extack); + return nfp_net_xdp_setup(nn, xdp); case XDP_QUERY_PROG: - xdp->prog_id = nn->xdp_prog ? nn->xdp_prog->aux->id : 0; - xdp->prog_flags = nn->xdp_prog ? nn->xdp_flags : 0; - return 0; + return xdp_attachment_query(&nn->xdp, xdp); default: return nfp_app_bpf(nn->app, nn, xdp); } diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 712e6f918065..c485d97b5df4 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -199,10 +199,8 @@ static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { int err; - if (ns->xdp_prog && (bpf->flags ^ ns->xdp_flags) & XDP_FLAGS_MODES) { - NSIM_EA(bpf->extack, "program loaded with different flags"); + if (!xdp_attachment_flags_ok(&ns->xdp, bpf)) return -EBUSY; - } if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); @@ -219,11 +217,7 @@ static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf) return err; } - if (ns->xdp_prog) - bpf_prog_put(ns->xdp_prog); - - ns->xdp_prog = bpf->prog; - ns->xdp_flags = bpf->flags; + xdp_attachment_setup(&ns->xdp, bpf); if (!bpf->prog) ns->xdp_prog_mode = XDP_ATTACHED_NONE; @@ -567,9 +561,7 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) nsim_bpf_destroy_prog(bpf->offload.prog); return 0; case XDP_QUERY_PROG: - bpf->prog_id = ns->xdp_prog ? ns->xdp_prog->aux->id : 0; - bpf->prog_flags = ns->xdp_prog ? ns->xdp_flags : 0; - return 0; + return xdp_attachment_query(&ns->xdp, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) @@ -636,6 +628,6 @@ void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(!list_empty(&ns->bpf_bound_progs)); WARN_ON(!list_empty(&ns->bpf_bound_maps)); - WARN_ON(ns->xdp_prog); + WARN_ON(ns->xdp.prog); WARN_ON(ns->bpf_offloaded); } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index d8a7cc995e88..69ffb4a2d14b 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -18,6 +18,7 @@ #include #include #include +#include #define DRV_NAME "netdevsim" @@ -67,9 +68,8 @@ struct netdevsim { struct bpf_prog *bpf_offloaded; u32 bpf_offloaded_id; - u32 xdp_flags; + struct xdp_attachment_info xdp; int xdp_prog_mode; - struct bpf_prog *xdp_prog; u32 prog_id_gen; diff --git a/include/net/xdp.h b/include/net/xdp.h index 2deea7166a34..fcb033f51d8c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -144,4 +144,17 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) return unlikely(xdp->data_meta > xdp->data); } +struct xdp_attachment_info { + struct bpf_prog *prog; + u32 flags; +}; + +struct netdev_bpf; +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf); + #endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 31c58719b5a9..57285383ed00 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -3,8 +3,11 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. * Released under terms in GPL version 2. See COPYING. */ +#include +#include #include #include +#include #include #include #include @@ -370,3 +373,34 @@ void xdp_return_buff(struct xdp_buff *xdp) __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); + +int xdp_attachment_query(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + bpf->prog_id = info->prog ? info->prog->aux->id : 0; + bpf->prog_flags = info->prog ? info->flags : 0; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_attachment_query); + +bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { + NL_SET_ERR_MSG(bpf->extack, + "program loaded with different flags"); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); + +void xdp_attachment_setup(struct xdp_attachment_info *info, + struct netdev_bpf *bpf) +{ + if (info->prog) + bpf_prog_put(info->prog); + info->prog = bpf->prog; + info->flags = bpf->flags; +} +EXPORT_SYMBOL_GPL(xdp_attachment_setup); diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index f8d9bd81d9a4..40401e9e9351 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -821,7 +821,7 @@ try: ret, _, err = sim.set_xdp(obj, "", force=True, fail=False, include_stderr=True) fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack_nsim(err, "program loaded with different flags.", args) + check_extack(err, "program loaded with different flags.", args) start_test("Test XDP prog remove with bad flags...") ret, _, err = sim.unset_xdp("offload", force=True, @@ -831,7 +831,7 @@ try: ret, _, err = sim.unset_xdp("", force=True, fail=False, include_stderr=True) fail(ret == 0, "Removed program with a bad mode") - check_extack_nsim(err, "program loaded with different flags.", args) + check_extack(err, "program loaded with different flags.", args) start_test("Test MTU restrictions...") ret, _ = sim.set_mtu(9000, fail=False) -- cgit v1.2.3 From 01683a1469995cc7aaf833d6f8b3f1c1d2fc3b92 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 13:29:11 +0300 Subject: net: sched: refactor flower walk to iterate over idr Extend struct tcf_walker with additional 'cookie' field. It is intended to be used by classifier walk implementations to continue iteration directly from particular filter, instead of iterating 'skip' number of times. Change flower walk implementation to save filter handle in 'cookie'. Each time flower walk is called, it looks up filter with saved handle directly with idr, instead of iterating over filter linked list 'skip' number of times. This change improves complexity of dumping flower classifier from quadratic to linearithmic. (assuming idr lookup has logarithmic complexity) Reviewed-by: Jiri Pirko Signed-off-by: Vlad Buslov Reported-by: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_api.c | 2 ++ net/sched/cls_flower.c | 20 +++++++++----------- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2081e4219f81..e4252a176eec 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -13,6 +13,7 @@ struct tcf_walker { int stop; int skip; int count; + unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 73d9967c3739..c51b1b12450d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1508,7 +1508,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; + arg.w.cookie = cb->args[2]; tp->ops->walk(tp, &arg.w); + cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) return false; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8b2474293db1..c53fdd411f90 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1099,19 +1099,17 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; - struct fl_flow_mask *mask; - list_for_each_entry_rcu(mask, &head->masks, list) { - list_for_each_entry_rcu(f, &mask->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; - } -skip: - arg->count++; + arg->count = arg->skip; + + while ((f = idr_get_next_ul(&head->handle_idr, + &arg->cookie)) != NULL) { + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; } + arg->cookie = f->handle + 1; + arg->count++; } } -- cgit v1.2.3 From d80a1b9d186057ddb0d384ba601cf2b7d214539c Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:39 +0300 Subject: tls: Refactor tls_offload variable names For symmetry, we rename tls_offload_context to tls_offload_context_tx before we add tls_offload_context_rx. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_accel/tls.h | 6 +++--- include/net/tls.h | 16 +++++++------- net/tls/tls_device.c | 25 +++++++++++----------- net/tls/tls_device_fallback.c | 8 +++---- 4 files changed, 27 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h index b6162178f621..b82f4deaa398 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.h @@ -50,7 +50,7 @@ struct mlx5e_tls { }; struct mlx5e_tls_offload_context { - struct tls_offload_context base; + struct tls_offload_context_tx base; u32 expected_seq; __be32 swid; }; @@ -59,8 +59,8 @@ static inline struct mlx5e_tls_offload_context * mlx5e_get_tls_tx_context(struct tls_context *tls_ctx) { BUILD_BUG_ON(sizeof(struct mlx5e_tls_offload_context) > - TLS_OFFLOAD_CONTEXT_SIZE); - return container_of(tls_offload_ctx(tls_ctx), + TLS_OFFLOAD_CONTEXT_SIZE_TX); + return container_of(tls_offload_ctx_tx(tls_ctx), struct mlx5e_tls_offload_context, base); } diff --git a/include/net/tls.h b/include/net/tls.h index 70c273777fe9..5dcd808236a7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -128,7 +128,7 @@ struct tls_record_info { skb_frag_t frags[MAX_SKB_FRAGS]; }; -struct tls_offload_context { +struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; @@ -147,8 +147,8 @@ struct tls_offload_context { #define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) }; -#define TLS_OFFLOAD_CONTEXT_SIZE \ - (ALIGN(sizeof(struct tls_offload_context), sizeof(void *)) + \ +#define TLS_OFFLOAD_CONTEXT_SIZE_TX \ + (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) enum { @@ -239,7 +239,7 @@ void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) @@ -380,10 +380,10 @@ static inline struct tls_sw_context_tx *tls_sw_ctx_tx( return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } -static inline struct tls_offload_context *tls_offload_ctx( - const struct tls_context *tls_ctx) +static inline struct tls_offload_context_tx * +tls_offload_ctx_tx(const struct tls_context *tls_ctx) { - return (struct tls_offload_context *)tls_ctx->priv_ctx_tx; + return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, @@ -396,7 +396,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct sk_buff *skb); int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7a8f8e20ff3..332a5d1459b6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,9 +52,8 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + kfree(tls_offload_ctx_tx(ctx)); - kfree(offload_ctx); kfree(ctx); } @@ -125,7 +124,7 @@ static void destroy_record(struct tls_record_info *record) kfree(record); } -static void delete_all_records(struct tls_offload_context *offload_ctx) +static void delete_all_records(struct tls_offload_context_tx *offload_ctx) { struct tls_record_info *info, *temp; @@ -141,14 +140,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; - struct tls_offload_context *ctx; + struct tls_offload_context_tx *ctx; u64 deleted_records = 0; unsigned long flags; if (!tls_ctx) return; - ctx = tls_offload_ctx(tls_ctx); + ctx = tls_offload_ctx_tx(tls_ctx); spin_lock_irqsave(&ctx->lock, flags); info = ctx->retransmit_hint; @@ -179,7 +178,7 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) void tls_device_sk_destruct(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); if (ctx->open_record) destroy_record(ctx->open_record); @@ -219,7 +218,7 @@ static void tls_append_frag(struct tls_record_info *record, static int tls_push_record(struct sock *sk, struct tls_context *ctx, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_record_info *record, struct page_frag *pfrag, int flags, @@ -264,7 +263,7 @@ static int tls_push_record(struct sock *sk, return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); } -static int tls_create_new_record(struct tls_offload_context *offload_ctx, +static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -290,7 +289,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx, } static int tls_do_allocation(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct page_frag *pfrag, size_t prepend_size) { @@ -324,7 +323,7 @@ static int tls_push_data(struct sock *sk, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; @@ -477,7 +476,7 @@ out: return rc; } -struct tls_record_info *tls_get_record(struct tls_offload_context *context, +struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; @@ -524,7 +523,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; struct tls_record_info *start_marker_record; - struct tls_offload_context *offload_ctx; + struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; struct net_device *netdev; char *iv, *rec_seq; @@ -546,7 +545,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) goto out; } - offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL); if (!offload_ctx) { rc = -ENOMEM; goto free_marker_record; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 748914abdb60..d1d7dce38e0b 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) static int fill_sg_in(struct scatterlist *sg_in, struct sk_buff *skb, - struct tls_offload_context *ctx, + struct tls_offload_context_tx *ctx, u64 *rcd_sn, s32 *sync_size, int *resync_sgs) @@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, s32 sync_size, u64 rcd_sn) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; void *buf, *iv, *aad, *dummy_buf; struct aead_request *aead_req; @@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) { int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int payload_len = skb->len - tcp_payload_offset; struct scatterlist *sg_in, sg_out[3]; struct sk_buff *nskb = NULL; @@ -415,7 +415,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, } int tls_sw_fallback_init(struct sock *sk, - struct tls_offload_context *offload_ctx, + struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { const u8 *key; -- cgit v1.2.3 From dafb67f3bb4a58a45fe92c1e362ea6429831688a Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:40 +0300 Subject: tls: Split decrypt_skb to two functions Previously, decrypt_skb also updated the TLS context. Now, decrypt_skb only decrypts the payload using the current context, while decrypt_skb_update also updates the state. Later, in the tls_device Rx flow, we will use decrypt_skb directly. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ net/tls/tls_sw.c | 44 ++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 5dcd808236a7..49b89221db43 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,6 +390,8 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout); struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7453f5ae0819..1d2271736717 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -53,7 +53,6 @@ static int tls_do_decryption(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; int ret; @@ -71,18 +70,6 @@ static int tls_do_decryption(struct sock *sk, ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - if (ret < 0) - goto out; - - rxm->offset += tls_ctx->rx.prepend_size; - rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); - - ctx->decrypted = true; - - ctx->saved_data_ready(sk); - -out: aead_request_free(aead_req); return ret; } @@ -666,8 +653,29 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } -static int decrypt_skb(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) +static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int err = 0; + + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + ctx->decrypted = true; + ctx->saved_data_ready(sk); + + return err; +} + +int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -812,7 +820,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -821,7 +829,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -892,7 +900,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL); if (err < 0) { tls_err_abort(sk, EBADMSG); -- cgit v1.2.3 From 39f56e1a78d647316db330c3b6f4c5637a895e3b Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:41 +0300 Subject: tls: Split tls_sw_release_resources_rx This patch splits tls_sw_release_resources_rx into two functions one which releases all inner software tls structures and another that also frees the containing structure. In TLS_DEVICE we will need to release the software structures without freeeing the containing structure, which contains other information. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 1 + net/tls/tls_sw.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 49b89221db43..7a485de25646 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -223,6 +223,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, void tls_sw_close(struct sock *sk, long timeout); void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); +void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1d2271736717..694d26589dcc 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1030,7 +1030,7 @@ void tls_sw_free_resources_tx(struct sock *sk) kfree(ctx); } -void tls_sw_free_resources_rx(struct sock *sk) +void tls_sw_release_resources_rx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -1049,6 +1049,14 @@ void tls_sw_free_resources_rx(struct sock *sk) strp_done(&ctx->strp); lock_sock(sk); } +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + + tls_sw_release_resources_rx(sk); kfree(ctx); } -- cgit v1.2.3 From 4799ac81e52a72a6404827bf2738337bb581a174 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Fri, 13 Jul 2018 14:33:43 +0300 Subject: tls: Add rx inline crypto offload This patch completes the generic infrastructure to offload TLS crypto to a network device. It enables the kernel to skip decryption and authentication of some skbs marked as decrypted by the NIC. In the fast path, all packets received are decrypted by the NIC and the performance is comparable to plain TCP. This infrastructure doesn't require a TCP offload engine. Instead, the NIC only decrypts packets that contain the expected TCP sequence number. Out-Of-Order TCP packets are provided unmodified. As a result, at the worst case a received TLS record consists of both plaintext and ciphertext packets. These partially decrypted records must be reencrypted, only to be decrypted. The notable differences between SW KTLS Rx and this offload are as follows: 1. Partial decryption - Software must handle the case of a TLS record that was only partially decrypted by HW. This can happen due to packet reordering. 2. Resynchronization - tls_read_size calls the device driver to resynchronize HW after HW lost track of TLS record framing in the TCP stream. Signed-off-by: Boris Pismenny Signed-off-by: David S. Miller --- include/net/tls.h | 63 +++++++++- net/tls/tls_device.c | 278 ++++++++++++++++++++++++++++++++++++++---- net/tls/tls_device_fallback.c | 1 + net/tls/tls_main.c | 32 +++-- net/tls/tls_sw.c | 24 +++- 5 files changed, 355 insertions(+), 43 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 7a485de25646..d8b3b6578c01 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,16 @@ struct tls_device { void (*unhash)(struct tls_device *device, struct sock *sk); }; +enum { + TLS_BASE, + TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif + TLS_HW_RECORD, + TLS_NUM_CONFIG, +}; + struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; @@ -197,6 +207,7 @@ struct tls_context { int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); + void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); int (*setsockopt)(struct sock *sk, int level, @@ -209,13 +220,27 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +struct tls_offload_context_rx { + /* sw must be the first member of tls_offload_context_rx */ + struct tls_sw_context_rx sw; + atomic64_t resync_req; + u8 driver_state[]; + /* The TLS layer reserves room for driver specific state + * Currently the belief is that there is not enough + * driver specific state to justify another layer of indirection + */ +}; + +#define TLS_OFFLOAD_CONTEXT_SIZE_RX \ + (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ + TLS_DRIVER_STATE_SIZE) + int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); - int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, @@ -290,11 +315,19 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } +struct sk_buff * +tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { - return sk_fullsock(sk) && - /* matches smp_store_release in tls_set_device_offload */ - smp_load_acquire(&sk->sk_destruct) == &tls_device_sk_destruct; +#ifdef CONFIG_SOCK_VALIDATE_XMIT + return sk_fullsock(sk) & + (smp_load_acquire(&sk->sk_validate_xmit_skb) == + &tls_validate_xmit_skb); +#else + return false; +#endif } static inline void tls_err_abort(struct sock *sk, int err) @@ -387,10 +420,27 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline struct tls_offload_context_rx * +tls_offload_ctx_rx(const struct tls_context *tls_ctx) +{ + return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; +} + +/* The TLS context is valid until sk_destruct is called */ +static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + + atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); +} + + int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); @@ -402,4 +452,9 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); + +void tls_device_offload_cleanup_rx(struct sock *sk); +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn); + #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 332a5d1459b6..4995d84d228d 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -52,7 +52,11 @@ static DEFINE_SPINLOCK(tls_device_lock); static void tls_device_free_ctx(struct tls_context *ctx) { - kfree(tls_offload_ctx_tx(ctx)); + if (ctx->tx_conf == TLS_HW) + kfree(tls_offload_ctx_tx(ctx)); + + if (ctx->rx_conf == TLS_HW) + kfree(tls_offload_ctx_rx(ctx)); kfree(ctx); } @@ -70,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work) list_for_each_entry_safe(ctx, tmp, &gc_list, list) { struct net_device *netdev = ctx->netdev; - if (netdev) { + if (netdev && ctx->tx_conf == TLS_HW) { netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); dev_put(netdev); + ctx->netdev = NULL; } list_del(&ctx->list); @@ -81,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work) } } +static void tls_device_attach(struct tls_context *ctx, struct sock *sk, + struct net_device *netdev) +{ + if (sk->sk_destruct != tls_device_sk_destruct) { + refcount_set(&ctx->refcount, 1); + dev_hold(netdev); + ctx->netdev = netdev; + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + ctx->sk_destruct = sk->sk_destruct; + sk->sk_destruct = tls_device_sk_destruct; + } +} + static void tls_device_queue_ctx_destruction(struct tls_context *ctx) { unsigned long flags; @@ -180,13 +201,15 @@ void tls_device_sk_destruct(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - if (ctx->open_record) - destroy_record(ctx->open_record); + tls_ctx->sk_destruct(sk); - delete_all_records(ctx); - crypto_free_aead(ctx->aead_send); - ctx->sk_destruct(sk); - clean_acked_data_disable(inet_csk(sk)); + if (tls_ctx->tx_conf == TLS_HW) { + if (ctx->open_record) + destroy_record(ctx->open_record); + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + clean_acked_data_disable(inet_csk(sk)); + } if (refcount_dec_and_test(&tls_ctx->refcount)) tls_device_queue_ctx_destruction(tls_ctx); @@ -519,6 +542,118 @@ static int tls_device_push_pending_record(struct sock *sk, int flags) return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); } +void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; + struct tls_offload_context_rx *rx_ctx; + u32 is_req_pending; + s64 resync_req; + u32 req_seq; + + if (tls_ctx->rx_conf != TLS_HW) + return; + + rx_ctx = tls_offload_ctx_rx(tls_ctx); + resync_req = atomic64_read(&rx_ctx->resync_req); + req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1); + is_req_pending = resync_req; + + if (unlikely(is_req_pending) && req_seq == seq && + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); +} + +static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + int err = 0, offset = rxm->offset, copy, nsg; + struct sk_buff *skb_iter, *unused; + struct scatterlist sg[1]; + char *orig_buf, *buf; + + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation); + if (!orig_buf) + return -ENOMEM; + buf = orig_buf; + + nsg = skb_cow_data(skb, 0, &unused); + if (unlikely(nsg < 0)) { + err = nsg; + goto free_buf; + } + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], buf, + rxm->full_len + TLS_HEADER_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE); + skb_copy_bits(skb, offset, buf, + TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + /* We are interested only in the decrypted data not the auth */ + err = decrypt_skb(sk, skb, sg); + if (err != -EBADMSG) + goto free_buf; + else + err = 0; + + copy = min_t(int, skb_pagelen(skb) - offset, + rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + + skb_walk_frags(skb, skb_iter) { + copy = min_t(int, skb_iter->len, + rxm->full_len - offset + rxm->offset - + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + + if (skb_iter->decrypted) + skb_store_bits(skb, offset, buf, copy); + + offset += copy; + buf += copy; + } + +free_buf: + kfree(orig_buf); + return err; +} + +int tls_device_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx); + int is_decrypted = skb->decrypted; + int is_encrypted = !is_decrypted; + struct sk_buff *skb_iter; + + /* Skip if it is already decrypted */ + if (ctx->sw.decrypted) + return 0; + + /* Check if all the data is decrypted already */ + skb_walk_frags(skb, skb_iter) { + is_decrypted &= skb_iter->decrypted; + is_encrypted &= !skb_iter->decrypted; + } + + ctx->sw.decrypted |= is_decrypted; + + /* Return immedeatly if the record is either entirely plaintext or + * entirely ciphertext. Otherwise handle reencrypt partially decrypted + * record. + */ + return (is_encrypted || is_decrypted) ? 0 : + tls_device_reencrypt(sk, skb); +} + int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { u16 nonce_size, tag_size, iv_size, rec_seq_size; @@ -608,7 +743,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; - offload_ctx->sk_destruct = sk->sk_destruct; /* TLS offload is greatly simplified if we don't send * SKBs where only part of the payload needs to be encrypted. @@ -618,8 +752,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (skb) TCP_SKB_CB(skb)->eor = 1; - refcount_set(&ctx->refcount, 1); - /* We support starting offload on multiple sockets * concurrently, so we only need a read lock here. * This lock must precede get_netdev_for_sock to prevent races between @@ -654,19 +786,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto release_netdev; - ctx->netdev = netdev; + tls_device_attach(ctx, sk, netdev); - spin_lock_irq(&tls_device_lock); - list_add_tail(&ctx->list, &tls_device_list); - spin_unlock_irq(&tls_device_lock); - - sk->sk_validate_xmit_skb = tls_validate_xmit_skb; /* following this assignment tls_is_sk_tx_device_offloaded * will return true and the context might be accessed * by the netdev's xmit function. */ - smp_store_release(&sk->sk_destruct, - &tls_device_sk_destruct); + smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb); + dev_put(netdev); up_read(&device_offload_lock); goto out; @@ -689,6 +816,105 @@ out: return rc; } +int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) +{ + struct tls_offload_context_rx *context; + struct net_device *netdev; + int rc = 0; + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: netdev %s with no TLS offload\n", + __func__, netdev->name); + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL); + if (!context) { + rc = -ENOMEM; + goto release_netdev; + } + + ctx->priv_ctx_rx = context; + rc = tls_set_sw_offload(sk, ctx, 0); + if (rc) + goto release_ctx; + + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX, + &ctx->crypto_recv, + tcp_sk(sk)->copied_seq); + if (rc) { + pr_err_ratelimited("%s: The netdev has refused to offload this socket\n", + __func__); + goto free_sw_resources; + } + + tls_device_attach(ctx, sk, netdev); + goto release_netdev; + +free_sw_resources: + tls_sw_free_resources_rx(sk); +release_ctx: + ctx->priv_ctx_rx = NULL; +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + return rc; +} + +void tls_device_offload_cleanup_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev; + + down_read(&device_offload_lock); + netdev = tls_ctx->netdev; + if (!netdev) + goto out; + + if (!(netdev->features & NETIF_F_HW_TLS_RX)) { + pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n", + __func__); + goto out; + } + + netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx, + TLS_OFFLOAD_CTX_DIR_RX); + + if (tls_ctx->tx_conf != TLS_HW) { + dev_put(netdev); + tls_ctx->netdev = NULL; + } +out: + up_read(&device_offload_lock); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + tls_sw_release_resources_rx(sk); +} + static int tls_device_down(struct net_device *netdev) { struct tls_context *ctx, *tmp; @@ -709,8 +935,12 @@ static int tls_device_down(struct net_device *netdev) spin_unlock_irqrestore(&tls_device_lock, flags); list_for_each_entry_safe(ctx, tmp, &list, list) { - netdev->tlsdev_ops->tls_dev_del(netdev, ctx, - TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->tx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + if (ctx->rx_conf == TLS_HW) + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_RX); ctx->netdev = NULL; dev_put(netdev); list_del_init(&ctx->list); @@ -731,12 +961,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - if (!(dev->features & NETIF_F_HW_TLS_TX)) + if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX))) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_FEAT_CHANGE: + if ((dev->features & NETIF_F_HW_TLS_RX) && + !dev->tlsdev_ops->tls_dev_resync_rx) + return NOTIFY_BAD; + if (dev->tlsdev_ops && dev->tlsdev_ops->tls_dev_add && dev->tlsdev_ops->tls_dev_del) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index d1d7dce38e0b..e3313c45663f 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -413,6 +413,7 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk, return tls_sw_fallback(sk, skb); } +EXPORT_SYMBOL_GPL(tls_validate_xmit_skb); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 301f22430469..b09867c8b817 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,15 +51,6 @@ enum { TLSV6, TLS_NUM_PROTS, }; -enum { - TLS_BASE, - TLS_SW, -#ifdef CONFIG_TLS_DEVICE - TLS_HW, -#endif - TLS_HW_RECORD, - TLS_NUM_CONFIG, -}; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); @@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } #ifdef CONFIG_TLS_DEVICE - if (ctx->tx_conf != TLS_HW) { + if (ctx->rx_conf == TLS_HW) + tls_device_offload_cleanup_rx(sk); + + if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) { #else { #endif @@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, conf = TLS_SW; } } else { - rc = tls_set_sw_offload(sk, ctx, 0); - conf = TLS_SW; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload_rx(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 0); + conf = TLS_SW; + } } if (rc) @@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; + + prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; + + prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; + + prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 5f7d70b24be6..fe5735c57774 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -654,16 +654,25 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, } static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout) + struct scatterlist *sgout, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); int err = 0; - err = decrypt_skb(sk, skb, sgout); +#ifdef CONFIG_TLS_DEVICE + err = tls_device_decrypted(sk, skb); if (err < 0) return err; +#endif + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, sgout); + if (err < 0) + return err; + } else { + *zc = false; + } rxm->offset += tls_ctx->rx.prepend_size; rxm->full_len -= tls_ctx->rx.overhead_size; @@ -820,7 +829,7 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto fallback_to_reg_recv; - err = decrypt_skb_update(sk, skb, sgin); + err = decrypt_skb_update(sk, skb, sgin, &zc); for (; pages > 0; pages--) put_page(sg_page(&sgin[pages])); if (err < 0) { @@ -829,7 +838,7 @@ int tls_sw_recvmsg(struct sock *sk, } } else { fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); goto recv_end; @@ -884,6 +893,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; + bool zc; lock_sock(sk); @@ -900,7 +910,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL); + err = decrypt_skb_update(sk, skb, NULL, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); @@ -989,6 +999,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } +#ifdef CONFIG_TLS_DEVICE + handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, + *(u64*)tls_ctx->rx.rec_seq); +#endif return data_len + TLS_HEADER_SIZE; read_failure: -- cgit v1.2.3 From f286586df68e7733a8e651098401f139dc2e17f4 Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Mon, 18 Jun 2018 15:12:52 +0200 Subject: netfilter: nft_tproxy: Move nf_tproxy_assign_sock() to nf_tproxy.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is also necessary to implement nft tproxy support Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 8 ++++++++ net/netfilter/xt_TPROXY.c | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..d5a80888cbe4 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -17,6 +17,14 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk) return false; } +/* assign a socket to the skb -- consumes sk */ +static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_edemux; +} + __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); /** diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 58fce4e749a9..35df0827e2ca 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -36,15 +36,6 @@ #include #include -/* assign a socket to the skb -- consumes sk */ -static void -nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = sock_edemux; -} - static unsigned int tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) -- cgit v1.2.3 From 60e3be94e6a1c5162a0763c9aafb5190b2b1fdce Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Jun 2018 17:55:32 +0200 Subject: openvswitch: use nf_ct_get_tuplepr, invert_tuplepr These versions deal with the l3proto/l4proto details internally. It removes only caller of nf_ct_get_tuple, so make it static. After this, l3proto->get_l4proto() can be removed in a followup patch. Signed-off-by: Florian Westphal Acked-by: Pravin B Shelar Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 7 ------- net/netfilter/nf_conntrack_core.c | 3 +-- net/openvswitch/conntrack.c | 17 +++-------------- 3 files changed, 4 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 9b5e7634713e..90df45022c51 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -40,13 +40,6 @@ void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); -bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, - unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - struct net *net, - struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *l4proto); - bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_l3proto *l3proto, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 85ab2fd6a665..be0ab81e6b2c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net, return scale_hash(hash_conntrack_raw(tuple, net)); } -bool +static bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff, unsigned int dataoff, @@ -244,7 +244,6 @@ nf_ct_get_tuple(const struct sk_buff *skb, return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } -EXPORT_SYMBOL_GPL(nf_ct_get_tuple); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 284aca2a252d..e05bd3e53f0f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -607,23 +607,12 @@ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb, bool natted) { - const struct nf_conntrack_l3proto *l3proto; - const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - unsigned int dataoff; - u8 protonum; - l3proto = __nf_ct_l3proto_find(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &protonum) <= 0) { - pr_debug("ovs_ct_find_existing: Can't get protonum\n"); - return NULL; - } - l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - protonum, net, &tuple, l3proto, l4proto)) { + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, + net, &tuple)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); return NULL; } @@ -632,7 +621,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, if (natted) { struct nf_conntrack_tuple inverse; - if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuplepr(&inverse, &tuple)) { pr_debug("ovs_ct_find_existing: Inversion failed!\n"); return NULL; } -- cgit v1.2.3 From f957be9d349a3800940f823b16e12b0405cc305b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:44 +0200 Subject: netfilter: conntrack: remove ctnetlink callbacks from l3 protocol trackers handle everything from ctnetlink directly. After all these years we still only support ipv4 and ipv6, so it seems reasonable to remove l3 protocol tracker support and instead handle ipv4/ipv6 from a common, always builtin inet tracker. Step 1: Get rid of all the l3proto->func() calls. Start with ctnetlink, then move on to packet-path ones. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 6 +- include/net/netfilter/nf_conntrack_l3proto.h | 8 --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 47 ------------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 48 ------------- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 - net/netfilter/nf_conntrack_expect.c | 1 - net/netfilter/nf_conntrack_helper.c | 1 - net/netfilter/nf_conntrack_netlink.c | 96 +++++++++++++++++++------- net/netfilter/nf_conntrack_proto.c | 5 +- net/netfilter/nf_conntrack_standalone.c | 14 ++-- net/netfilter/nfnetlink_cttimeout.c | 1 - 11 files changed, 79 insertions(+), 149 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 90df45022c51..d454a53ba646 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -68,10 +68,8 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -void -print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *proto); +void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l4proto *proto); #define CONNTRACK_LOCKS 1024 diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d5808f3e2715..d07b5216a925 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -46,14 +46,6 @@ struct nf_conntrack_l3proto { int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - int (*tuple_to_nlattr)(struct sk_buff *skb, - const struct nf_conntrack_tuple *t); - int (*nlattr_to_tuple)(struct nlattr *tb[], - struct nf_conntrack_tuple *t); - const struct nla_policy *nla_policy; -#endif - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 9db988f9a4d7..98ed12858c52 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -274,41 +274,6 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int ipv4_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = { .type = NLA_U32 }, - [CTA_IP_V4_DST] = { .type = NLA_U32 }, -}; - -static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; - - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); - - return 0; -} -#endif - static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -360,13 +325,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_to_tuple = ipv4_nlattr_to_tuple, - .nla_policy = ipv4_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ - NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ -#endif .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, @@ -419,11 +377,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) need_conntrack(); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv4.nla_size)) - return -EINVAL; -#endif ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 663827ee3cf8..13a660ae5799 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -269,41 +269,6 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; } -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int ipv6_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) -{ - if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || - nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = { - [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 }, - [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 }, -}; - -static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) -{ - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; - - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); - - return 0; -} -#endif - static int ipv6_hooks_register(struct net *net) { struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); @@ -345,13 +310,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = ipv6_tuple_to_nlattr, - .nlattr_to_tuple = ipv6_nlattr_to_tuple, - .nla_policy = ipv6_nla_policy, - .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + - NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), -#endif .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, .me = THIS_MODULE, @@ -409,12 +367,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) need_conntrack(); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != - nf_conntrack_l3proto_ipv6.nla_size)) - return -EINVAL; -#endif - ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index c87b48359e8f..e631be25337e 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #endif diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 853b23206bb7..3f586ba23d92 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l3proto_find(expect->tuple.src.l3num), __nf_ct_l4proto_find(expect->tuple.src.l3num, expect->tuple.dst.protonum)); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a75b11c39312..a55a58c706a9 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 20a2e37c76d1..40152b9ad772 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -81,9 +80,26 @@ nla_put_failure: return -1; } +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + return -EMSGSIZE; + return 0; +} + +static int ipv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || + nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) + return -EMSGSIZE; + return 0; +} + static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto) + const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; @@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, if (!nest_parms) goto nla_put_failure; - if (likely(l3proto->tuple_to_nlattr)) - ret = l3proto->tuple_to_nlattr(skb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_tuple_to_nlattr(skb, tuple); + break; + case NFPROTO_IPV6: + ret = ipv6_tuple_to_nlattr(skb, tuple); + break; + } nla_nest_end(skb, nest_parms); @@ -106,13 +128,11 @@ nla_put_failure: static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, @@ -556,15 +576,20 @@ nla_put_failure: return -1; } +static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, + [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, + [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 }, +}; + #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - len = l3proto->nla_size; + len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); @@ -936,29 +961,54 @@ out: return skb->len; } +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + + return 0; +} + +static int ipv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + + return 0; +} + static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_IP_MAX+1]; - struct nf_conntrack_l3proto *l3proto; int ret = 0; ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); if (ret < 0) return ret; - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = nla_validate_nested(attr, CTA_IP_MAX, + cta_ip_nla_policy, NULL); + if (ret) + return ret; - if (likely(l3proto->nlattr_to_tuple)) { - ret = nla_validate_nested(attr, CTA_IP_MAX, - l3proto->nla_policy, NULL); - if (ret == 0) - ret = l3proto->nlattr_to_tuple(tb, tuple); + switch (tuple->src.l3num) { + case NFPROTO_IPV4: + ret = ipv4_nlattr_to_tuple(tb, tuple); + break; + case NFPROTO_IPV6: + ret = ipv6_nlattr_to_tuple(tb, tuple); + break; } - rcu_read_unlock(); - return ret; } @@ -2581,7 +2631,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; @@ -2597,8 +2646,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, goto nla_put_failure; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(tuple->src.l3num); - ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index d88841fbc560..859cb303bb91 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -294,10 +294,7 @@ int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) if (proto->l3proto >= NFPROTO_NUMPROTO) return -EBUSY; -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - if (proto->tuple_to_nlattr && proto->nla_size == 0) - return -EINVAL; -#endif + mutex_lock(&nf_ct_proto_mutex); old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], lockdep_is_held(&nf_ct_proto_mutex)); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b642c0b2495c..47b80fd0d2c3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -38,10 +37,9 @@ MODULE_LICENSE("GPL"); #ifdef CONFIG_NF_CONNTRACK_PROCFS void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - switch (l3proto->l3proto) { + switch (tuple->src.l3num) { case NFPROTO_IPV4: seq_printf(s, "src=%pI4 dst=%pI4 ", &tuple->src.u3.ip, &tuple->dst.u3.ip); @@ -282,7 +280,6 @@ static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct net *net = seq_file_net(s); int ret = 0; @@ -303,14 +300,12 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - WARN_ON(!l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); WARN_ON(!l4proto); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u ", - l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), + l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct), l4proto_name(l4proto->l4proto), nf_ct_protonum(ct)); if (!test_bit(IPS_OFFLOAD_BIT, &ct->status)) @@ -320,7 +315,7 @@ static int ct_seq_show(struct seq_file *s, void *v) l4proto->print_conntrack(s, ct); print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto); + l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); @@ -333,8 +328,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) seq_puts(s, "[UNREPLIED] "); - print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto); + print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto); ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 9ee5fa551fa6..9da4b8462004 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 47a91b14de62e35d1466820cbb4c024b6c02dff1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:45 +0200 Subject: netfilter: conntrack: remove pkt_to_tuple indirection from l3 protocol trackers Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 7 ----- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 17 ----------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 18 ------------ net/netfilter/nf_conntrack_core.c | 39 ++++++++++++++++++++++---- net/netfilter/nf_conntrack_l3proto_generic.c | 10 ------- 5 files changed, 33 insertions(+), 58 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index d07b5216a925..ece231450f30 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Try to fill in the third arg: nhoff is offset of l3 proto - * hdr. Return true if possible. - */ - bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple); - /* * Invert the per-proto part of the tuple: ie. turn xmit into reply. * Some packets can't be inverted: return 0 in that case. diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 98ed12858c52..7ed56f61798b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,22 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const __be32 *ap; - __be32 _addrs[2]; - ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr), - sizeof(u_int32_t) * 2, _addrs); - if (ap == NULL) - return false; - - tuple->src.u3.ip = ap[0]; - tuple->dst.u3.ip = ap[1]; - - return true; -} - static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -322,7 +306,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 13a660ae5799..bdb1709bb951 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,23 +41,6 @@ struct conntrack6_net { unsigned int users; }; -static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - const u_int32_t *ap; - u_int32_t _addrs[8]; - - ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), - sizeof(_addrs), _addrs); - if (ap == NULL) - return false; - - memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); - - return true; -} - static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -307,7 +290,6 @@ static void ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index be0ab81e6b2c..66b2ebae2747 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -230,15 +230,43 @@ nf_ct_get_tuple(const struct sk_buff *skb, u_int8_t protonum, struct net *net, struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { + unsigned int size; + const __be32 *ap; + __be32 _addrs[8]; + memset(tuple, 0, sizeof(*tuple)); tuple->src.l3num = l3num; - if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + switch (l3num) { + case NFPROTO_IPV4: + nhoff += offsetof(struct iphdr, saddr); + size = 2 * sizeof(__be32); + break; + case NFPROTO_IPV6: + nhoff += offsetof(struct ipv6hdr, saddr); + size = sizeof(_addrs); + break; + default: + return true; + } + + ap = skb_header_pointer(skb, nhoff, size, _addrs); + if (!ap) return false; + switch (l3num) { + case NFPROTO_IPV4: + tuple->src.u3.ip = ap[0]; + tuple->dst.u3.ip = ap[1]; + break; + case NFPROTO_IPV6: + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + break; + } + tuple->dst.protonum = protonum; tuple->dst.dir = IP_CT_DIR_ORIGINAL; @@ -267,7 +295,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, l4proto = __nf_ct_l4proto_find(l3num, protonum); ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, - l3proto, l4proto); + l4proto); rcu_read_unlock(); return ret; @@ -1318,8 +1346,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, net, &tuple, l3proto, - l4proto)) { + dataoff, l3num, protonum, net, &tuple, l4proto)) { pr_debug("Can't get tuple\n"); return 0; } @@ -1633,7 +1660,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) l4proto = nf_ct_l4proto_find_get(l3num, l4num); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, - l4num, net, &tuple, l3proto, l4proto)) + l4num, net, &tuple, l4proto)) return -1; if (ct->status & IPS_SRC_NAT) { diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 397e6911214f..0b01c9970e99 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -31,15 +31,6 @@ #include #include -static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, - struct nf_conntrack_tuple *tuple) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { @@ -59,7 +50,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .get_l4proto = generic_get_l4proto, }; -- cgit v1.2.3 From d1b6fe94941f43e4743d5fea953d16b0a001c2c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:46 +0200 Subject: netfilter: conntrack: remove invert_tuple indirection from l3 protocol trackers Its simpler to just handle it directly in nf_ct_invert_tuple(). Also gets rid of need to pass l3proto pointer to resolve_conntrack(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 7 ------- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 10 ---------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 3 +-- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 10 ---------- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 3 +-- net/netfilter/nf_conntrack_core.c | 26 ++++++++++++++++---------- net/netfilter/nf_conntrack_l3proto_generic.c | 10 ---------- 8 files changed, 18 insertions(+), 52 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index d454a53ba646..35461b2d3462 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -42,7 +42,6 @@ void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto); /* Find a connection corresponding to a tuple. */ diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index ece231450f30..164641c743a5 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,13 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. - */ - bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig); - /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) in skb diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 7ed56f61798b..e10e38c443ab 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,15 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u3.ip = orig->dst.u3.ip; - tuple->dst.u3.ip = orig->src.u3.ip; - - return true; -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -306,7 +297,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .invert_tuple = ipv4_invert_tuple, .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 5c15beafa711..34095949a003 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -142,8 +142,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, - &nf_conntrack_l3proto_ipv4, innerproto)) { + if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index bdb1709bb951..f8051fe20489 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,15 +41,6 @@ struct conntrack6_net { unsigned int users; }; -static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); - memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); - - return true; -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -290,7 +281,6 @@ static void ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .invert_tuple = ipv6_invert_tuple, .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 2548e2c8aedd..8bcbc2f15bd5 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -152,8 +152,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple, - &nf_conntrack_l3proto_ipv6, inproto)) { + if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { pr_debug("icmpv6_error: Can't invert tuple\n"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 66b2ebae2747..14c040805b32 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -305,14 +305,24 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { memset(inverse, 0, sizeof(*inverse)); inverse->src.l3num = orig->src.l3num; - if (l3proto->invert_tuple(inverse, orig) == 0) - return false; + + switch (orig->src.l3num) { + case NFPROTO_IPV4: + inverse->src.u3.ip = orig->dst.u3.ip; + inverse->dst.u3.ip = orig->src.u3.ip; + break; + case NFPROTO_IPV6: + inverse->src.u3.in6 = orig->dst.u3.in6; + inverse->dst.u3.in6 = orig->src.u3.in6; + break; + default: + break; + } inverse->dst.dir = !orig->dst.dir; @@ -1222,7 +1232,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) @@ -1237,7 +1246,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_zone tmp; unsigned int *timeouts; - if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { pr_debug("Can't invert tuple.\n"); return NULL; } @@ -1334,7 +1343,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { const struct nf_conntrack_zone *zone; @@ -1356,7 +1364,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { - h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + h = init_conntrack(net, tmpl, &tuple, l4proto, skb, dataoff, hash); if (!h) return 0; @@ -1439,8 +1447,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, - l3proto, l4proto); + ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); if (ret < 0) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(net, drop); @@ -1497,7 +1504,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l3proto_find(orig->src.l3num), __nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum)); rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 0b01c9970e99..d6a8fe591ccc 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -31,15 +31,6 @@ #include #include -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); - memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); - - return true; -} - static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -50,7 +41,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .invert_tuple = generic_invert_tuple, .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); -- cgit v1.2.3 From 6816d931cab009024b68c11c4cf752f8bf9a1e32 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:47 +0200 Subject: netfilter: conntrack: remove get_l4proto indirection from l3 protocol trackers Handle it in the core instead. ipv6_skip_exthdr() is built-in even if ipv6 is a module, i.e. this doesn't create an ipv6 dependency. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 8 -- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 30 ------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 29 ------- net/netfilter/Makefile | 2 +- net/netfilter/nf_conntrack_core.c | 108 ++++++++++++++++++++----- net/netfilter/nf_conntrack_l3proto_generic.c | 46 ----------- net/netfilter/nf_conntrack_proto.c | 5 ++ 7 files changed, 94 insertions(+), 134 deletions(-) delete mode 100644 net/netfilter/nf_conntrack_l3proto_generic.c (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 164641c743a5..5f160375c93a 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -24,14 +24,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* - * Called before tracking. - * *dataoff: offset of protocol header (TCP, UDP,...) in skb - * *protonum: protocol number - */ - int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum); - /* Called when netns wants to use connection tracking */ int (*net_ns_get)(struct net *); void (*net_ns_put)(struct net *); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e10e38c443ab..9fbf6c7f8ece 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -38,35 +38,6 @@ struct conntrack4_net { unsigned int users; }; -static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - const struct iphdr *iph; - struct iphdr _iph; - - iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); - if (iph == NULL) - return -NF_ACCEPT; - - /* Conntrack defragments packets, we might still see fragments - * inside ICMP packets though. */ - if (iph->frag_off & htons(IP_OFFSET)) - return -NF_ACCEPT; - - *dataoff = nhoff + (iph->ihl << 2); - *protonum = iph->protocol; - - /* Check bogus IP headers */ - if (*dataoff > skb->len) { - pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: " - "nhoff %u, ihl %u, skblen %u\n", - nhoff, iph->ihl << 2, skb->len); - return -NF_ACCEPT; - } - - return NF_ACCEPT; -} - static unsigned int ipv4_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -297,7 +268,6 @@ static void ipv4_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { .l3proto = PF_INET, - .get_l4proto = ipv4_get_l4proto, .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, .me = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index f8051fe20489..37ab25645cf2 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -41,34 +41,6 @@ struct conntrack6_net { unsigned int users; }; -static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - unsigned int extoff = nhoff + sizeof(struct ipv6hdr); - __be16 frag_off; - int protoff; - u8 nexthdr; - - if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), - &nexthdr, sizeof(nexthdr)) != 0) { - pr_debug("ip6_conntrack_core: can't get nexthdr\n"); - return -NF_ACCEPT; - } - protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); - /* - * (protoff == skb->len) means the packet has not data, just - * IPv6 and possibly extensions headers, but it is tracked anyway - */ - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); - return -NF_ACCEPT; - } - - *dataoff = protoff; - *protonum = nexthdr; - return NF_ACCEPT; -} - static unsigned int ipv6_helper(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -281,7 +253,6 @@ static void ipv6_hooks_unregister(struct net *net) const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { .l3proto = PF_INET6, - .get_l4proto = ipv6_get_l4proto, .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, .me = THIS_MODULE, diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 44449389e527..f132ea850778 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 14c040805b32..0674c6e5bfed 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -37,7 +37,6 @@ #include #include -#include #include #include #include @@ -55,6 +54,7 @@ #include #include #include +#include #include "nf_internals.h" @@ -273,21 +273,94 @@ nf_ct_get_tuple(const struct sk_buff *skb, return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); } +static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u_int8_t *protonum) +{ + int dataoff = -1; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) + const struct iphdr *iph; + struct iphdr _iph; + + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + /* Conntrack defragments packets, we might still see fragments + * inside ICMP packets though. + */ + if (iph->frag_off & htons(IP_OFFSET)) + return -1; + + dataoff = nhoff + (iph->ihl << 2); + *protonum = iph->protocol; + + /* Check bogus IP headers */ + if (dataoff > skb->len) { + pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", + nhoff, iph->ihl << 2, skb->len); + return -1; + } +#endif + return dataoff; +} + +static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + u8 *protonum) +{ + int protoff = -1; +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) + unsigned int extoff = nhoff + sizeof(struct ipv6hdr); + __be16 frag_off; + u8 nexthdr; + + if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), + &nexthdr, sizeof(nexthdr)) != 0) { + pr_debug("can't get nexthdr\n"); + return -1; + } + protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); + /* + * (protoff == skb->len) means the packet has not data, just + * IPv6 and possibly extensions headers, but it is tracked anyway + */ + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("can't find proto in pkt\n"); + return -1; + } + + *protonum = nexthdr; +#endif + return protoff; +} + +static int get_l4proto(const struct sk_buff *skb, + unsigned int nhoff, u8 pf, u8 *l4num) +{ + switch (pf) { + case NFPROTO_IPV4: + return ipv4_get_l4proto(skb, nhoff, l4num); + case NFPROTO_IPV6: + return ipv6_get_l4proto(skb, nhoff, l4num); + default: + *l4num = 0; + break; + } + return -1; +} + bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; - unsigned int protoff; - u_int8_t protonum; + u8 protonum; + int protoff; int ret; rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(l3num); - ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); - if (ret != NF_ACCEPT) { + protoff = get_l4proto(skb, nhoff, l3num, &protonum); + if (protoff <= 0) { rcu_read_unlock(); return false; } @@ -1397,14 +1470,12 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; unsigned int *timeouts; - unsigned int dataoff; u_int8_t protonum; - int ret; + int dataoff, ret; tmpl = nf_ct_get(skb, &ctinfo); if (tmpl || ctinfo == IP_CT_UNTRACKED) { @@ -1418,14 +1489,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, } /* rcu_read_lock()ed by nf_hook_thresh */ - l3proto = __nf_ct_l3proto_find(pf); - ret = l3proto->get_l4proto(skb, skb_network_offset(skb), - &dataoff, &protonum); - if (ret <= 0) { + dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); + if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - ret = -ret; + ret = NF_ACCEPT; goto out; } @@ -1641,14 +1710,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) static int nf_conntrack_update(struct net *net, struct sk_buff *skb) { - const struct nf_conntrack_l3proto *l3proto; const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct nf_nat_hook *nat_hook; - unsigned int dataoff, status; + unsigned int status; struct nf_conn *ct; + int dataoff; u16 l3num; u8 l4num; @@ -1657,10 +1726,9 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) return 0; l3num = nf_ct_l3num(ct); - l3proto = nf_ct_l3proto_find_get(l3num); - if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, - &l4num) <= 0) + dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); + if (dataoff <= 0) return -1; l4proto = nf_ct_l4proto_find_get(l3num, l4num); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c deleted file mode 100644 index d6a8fe591ccc..000000000000 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * (C) 2003,2004 USAGI/WIDE Project - * - * Based largely upon the original ip_conntrack code which - * had the following copyright information: - * - * (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, - unsigned int *dataoff, u_int8_t *protonum) -{ - /* Never track !!! */ - return -NF_ACCEPT; -} - - -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { - .l3proto = PF_UNSPEC, - .get_l4proto = generic_get_l4proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 859cb303bb91..39df72bb9d56 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -35,6 +35,11 @@ EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); +struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { + .l3proto = PF_UNSPEC, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); + #ifdef CONFIG_SYSCTL static int nf_ct_register_sysctl(struct net *net, -- cgit v1.2.3 From 8b3892ea8718920d29432328fe9544d89a429614 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:48 +0200 Subject: netfilter: conntrack: avoid calls to l4proto invert_tuple Handle the common cases (tcp, udp, etc). in the core and only do the indirect call for the protocols that need it (GRE for instance). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- net/netfilter/nf_conntrack_core.c | 8 +++++++- net/netfilter/nf_conntrack_proto_dccp.c | 10 ---------- net/netfilter/nf_conntrack_proto_generic.c | 10 ---------- net/netfilter/nf_conntrack_proto_gre.c | 10 ---------- net/netfilter/nf_conntrack_proto_sctp.c | 10 ---------- net/netfilter/nf_conntrack_proto_tcp.c | 10 ---------- net/netfilter/nf_conntrack_proto_udp.c | 12 ------------ 8 files changed, 8 insertions(+), 64 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index a7220eef9aee..6a55e337a161 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -36,7 +36,7 @@ struct nf_conntrack_l4proto { struct net *net, struct nf_conntrack_tuple *tuple); /* Invert the per-proto part of the tuple: ie. turn xmit into reply. - * Some packets can't be inverted: return 0 in that case. + * Only used by icmp, most protocols use a generic version. */ bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0674c6e5bfed..92efce69b690 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -400,7 +400,13 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, inverse->dst.dir = !orig->dst.dir; inverse->dst.protonum = orig->dst.protonum; - return l4proto->invert_tuple(inverse, orig); + + if (unlikely(l4proto->invert_tuple)) + return l4proto->invert_tuple(inverse, orig); + + inverse->src.u.all = orig->dst.u.all; + inverse->dst.u.all = orig->src.u.all; + return true; } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index abe647d5b8c6..05620c03f138 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -403,14 +403,6 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, - const struct nf_conntrack_tuple *tuple) -{ - inv->src.u.dccp.port = tuple->dst.u.dccp.port; - inv->dst.u.dccp.port = tuple->src.u.dccp.port; - return true; -} - static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, unsigned int *timeouts) { @@ -865,7 +857,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, @@ -901,7 +892,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, .pkt_to_tuple = dccp_pkt_to_tuple, - .invert_tuple = dccp_invert_tuple, .new = dccp_new, .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 6c6896d21cd7..4dfe40aa9446 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -41,15 +41,6 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, return true; } -static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.all = 0; - tuple->dst.u.all = 0; - - return true; -} - static unsigned int *generic_get_timeouts(struct net *net) { return &(generic_pernet(net)->timeout); @@ -168,7 +159,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .l3proto = PF_UNSPEC, .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, - .invert_tuple = generic_invert_tuple, .packet = generic_packet, .get_timeouts = generic_get_timeouts, .new = generic_new, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index d049ea5a3770..0bd40eb06b55 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -179,15 +179,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ -/* invert gre part of tuple */ -static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->dst.u.gre.key = orig->src.u.gre.key; - tuple->src.u.gre.key = orig->dst.u.gre.key; - return true; -} - /* gre hdr info to tuple */ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) @@ -356,7 +347,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, - .invert_tuple = gre_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index fb9a35d16069..148957a5cf3e 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -166,14 +166,6 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.sctp.port = orig->dst.u.sctp.port; - tuple->dst.u.sctp.port = orig->src.u.sctp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -781,7 +773,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif @@ -818,7 +809,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, - .invert_tuple = sctp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 8e67910185a0..03cff1e3066a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -293,14 +293,6 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.tcp.port = orig->dst.u.tcp.port; - tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return true; -} - #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) @@ -1560,7 +1552,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .l3proto = PF_INET, .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif @@ -1598,7 +1589,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, - .invert_tuple = tcp_invert_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index fe7243970aa4..6fe2233c323a 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -55,14 +55,6 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb, return true; } -static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - tuple->src.u.udp.port = orig->dst.u.udp.port; - tuple->dst.u.udp.port = orig->src.u.udp.port; - return true; -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -302,7 +294,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l4proto = IPPROTO_UDP, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -334,7 +325,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -366,7 +356,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l4proto = IPPROTO_UDP, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -398,7 +387,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, - .invert_tuple = udp_invert_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, -- cgit v1.2.3 From c779e849608a875448f6ffc2a5c2a15523bdcd00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:50 +0200 Subject: netfilter: conntrack: remove get_timeout() indirection Not needed, we can have the l4trackers fetch it themselvs. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 8 ++------ include/net/netfilter/nf_conntrack_timeout.h | 18 ++++-------------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 16 +++++++++++----- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 14 ++++++++++---- net/netfilter/nf_conntrack_core.c | 16 ++-------------- net/netfilter/nf_conntrack_proto_dccp.c | 17 +++++++---------- net/netfilter/nf_conntrack_proto_generic.c | 22 ++++++++++++---------- net/netfilter/nf_conntrack_proto_gre.c | 14 ++++++++++---- net/netfilter/nf_conntrack_proto_sctp.c | 18 ++++++++---------- net/netfilter/nf_conntrack_proto_tcp.c | 23 +++++++++++------------ net/netfilter/nf_conntrack_proto_udp.c | 20 +++++++++++++------- net/netfilter/nfnetlink_cttimeout.c | 12 ++++-------- 12 files changed, 94 insertions(+), 104 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6a55e337a161..c7a0075d96df 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -45,13 +45,12 @@ struct nf_conntrack_l4proto { int (*packet)(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts); + enum ip_conntrack_info ctinfo); /* Called when a new connection for this protocol found; * returns TRUE if it's OK. If so, packet() called next. */ bool (*new)(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts); + unsigned int dataoff); /* Called when a conntrack entry is destroyed */ void (*destroy)(struct nf_conn *ct); @@ -63,9 +62,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Return the array of timeouts for this protocol. */ - unsigned int *(*get_timeouts)(struct net *net); - /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9468ab4ad12d..80ceb3d0291d 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -67,27 +67,17 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, #endif }; -static inline unsigned int * -nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, - const struct nf_conntrack_l4proto *l4proto) +static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { + unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; - unsigned int *timeouts; timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) { + if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - - return timeouts; -#else - return l4proto->get_timeouts(net); #endif + return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 34095949a003..036670b38282 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net) static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmp_get_timeouts(nf_ct_net(ct)); + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; @@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, @@ -280,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { + if (!timeout) + timeout = &in->timeout; *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else { + } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } @@ -357,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, - .get_timeouts = icmp_get_timeouts, .new = icmp_new, .error = icmp_error, .destroy = NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 8bcbc2f15bd5..bed07b998a10 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net) static int icmpv6_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmpv6_get_timeouts(nf_ct_net(ct)); + /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ @@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMPV6_ECHO_REQUEST - 128] = 1, @@ -280,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeout = data; struct nf_icmp_net *in = icmpv6_pernet(net); + if (!timeout) + timeout = icmpv6_get_timeouts(net); if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { *timeout = ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; @@ -358,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .packet = icmpv6_packet, - .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, .error = icmpv6_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 994591fd9b96..c069f2faff4c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1337,7 +1337,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; - unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { pr_debug("Can't invert tuple.\n"); @@ -1356,15 +1355,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; - if (timeout_ext) { - timeouts = nf_ct_timeout_data(timeout_ext); - if (unlikely(!timeouts)) - timeouts = l4proto->get_timeouts(net); - } else { - timeouts = l4proto->get_timeouts(net); - } - if (!l4proto->new(ct, skb, dataoff, timeouts)) { + if (!l4proto->new(ct, skb, dataoff)) { nf_conntrack_free(ct); pr_debug("can't track with proto module\n"); return NULL; @@ -1493,7 +1485,6 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; - unsigned int *timeouts; u_int8_t protonum; int dataoff, ret; @@ -1552,10 +1543,7 @@ repeat: goto out; } - /* Decide what timeout policy we want to apply to this flow. */ - timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - - ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index abfdce7baed5..f476d116c816 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -23,6 +23,7 @@ #include #include #include +#include #include /* Timeouts are based on values from RFC4340: @@ -389,7 +390,7 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net) } static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; @@ -437,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } -static unsigned int *dccp_get_timeouts(struct net *net) -{ - return dccp_pernet(net)->dccp_timeout; -} - static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + unsigned int dataoff, enum ip_conntrack_info ctinfo) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; + unsigned int *timeouts; dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); BUG_ON(dh == NULL); @@ -523,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; @@ -843,7 +842,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { .l4proto = IPPROTO_DCCP, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS @@ -877,7 +875,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .l4proto = IPPROTO_DCCP, .new = dccp_new, .packet = dccp_packet, - .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 4dfe40aa9446..ac4a0b296dcd 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -11,6 +11,7 @@ #include #include #include +#include static const unsigned int nf_ct_generic_timeout = 600*HZ; @@ -41,25 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, return true; } -static unsigned int *generic_get_timeouts(struct net *net) -{ - return &(generic_pernet(net)->timeout); -} - /* Returns verdict for packet, or -1 for invalid. */ static int generic_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeout) + enum ip_conntrack_info ctinfo) { + const unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = &generic_pernet(nf_ct_net(ct))->timeout; + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { bool ret; @@ -78,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeout = data; struct nf_generic_net *gn = generic_pernet(net); + unsigned int *timeout = data; + + if (!timeout) + timeout = &gn->timeout; if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) *timeout = @@ -160,7 +163,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, .packet = generic_packet, - .get_timeouts = generic_get_timeouts, .new = generic_new, #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) .ctnl_timeout = { diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 0bd40eb06b55..d1632252bf5b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -234,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net) static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ @@ -254,8 +254,13 @@ static int gre_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { + unsigned int *timeouts = nf_ct_timeout_lookup(ct); + + if (!timeouts) + timeouts = gre_get_timeouts(nf_ct_net(ct)); + pr_debug(": "); nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); @@ -291,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct netns_proto_gre *net_gre = gre_pernet(net); + if (!timeouts) + timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. */ timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED]; timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED]; @@ -350,7 +357,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif - .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, .destroy = gre_destroy, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b4126a842bfd..8d1e085fc14a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -28,6 +28,7 @@ #include #include #include +#include /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR @@ -272,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } -static unsigned int *sctp_get_timeouts(struct net *net) -{ - return sctp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -291,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; u_int32_t offset, count; + unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); @@ -379,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct, } spin_unlock_bh(&ct->lock); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = sctp_pernet(nf_ct_net(ct))->timeouts; + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && @@ -399,7 +399,7 @@ out: /* Called when a new connection for this protocol found. */ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum sctp_conntrack new_state; const struct sctphdr *sh; @@ -760,7 +760,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, @@ -795,7 +794,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, .can_early_drop = sctp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 13c89fd107b2..d80d322b9d8b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -768,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } -static unsigned int *tcp_get_timeouts(struct net *net) -{ - return tcp_pernet(net)->timeouts; -} - /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; + unsigned int index, *timeouts; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; - unsigned int index; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); @@ -1021,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct, && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = tn->timeouts; + if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; @@ -1070,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { enum tcp_conntrack new_state; const struct tcphdr *th; @@ -1288,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void) static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { - unsigned int *timeouts = data; struct nf_tcp_net *tn = tcp_pernet(net); + unsigned int *timeouts = data; int i; + if (!timeouts) + timeouts = tn->timeouts; /* set default TCP timeouts. */ for (i=0; itimeouts[i]; @@ -1538,7 +1539,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, @@ -1574,7 +1574,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8b435d70ffe3..7a1b8988a931 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -45,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net) static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo, - unsigned int *timeouts) + enum ip_conntrack_info ctinfo) { + unsigned int *timeouts; + + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = udp_get_timeouts(nf_ct_net(ct)); + /* If we've seen traffic both ways, this is some kind of UDP stream. Extend timeout. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { @@ -65,7 +71,7 @@ static int udp_packet(struct nf_conn *ct, /* Called when a new connection for this protocol found. */ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, unsigned int *timeouts) + unsigned int dataoff) { return true; } @@ -176,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], unsigned int *timeouts = data; struct nf_udp_net *un = udp_pernet(net); + if (!timeouts) + timeouts = un->timeouts; + /* set default timeouts for UDP. */ timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED]; timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED]; @@ -275,7 +284,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -305,7 +313,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -335,7 +342,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -365,7 +371,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .l4proto = IPPROTO_UDPLITE, .allow_clash = true, .packet = udp_packet, - .get_timeouts = udp_get_timeouts, .new = udp_new, .error = udplite_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -388,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif +#include diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 9da4b8462004..d9d952fad3e0 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -46,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { }; static int -ctnl_timeout_parse_policy(void *timeouts, +ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { @@ -67,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts, if (ret < 0) goto err; - ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); @@ -372,7 +372,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, struct netlink_ext_ack *extack) { const struct nf_conntrack_l4proto *l4proto; - unsigned int *timeouts; __u16 l3num; __u8 l4num; int ret; @@ -392,9 +391,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - timeouts = l4proto->get_timeouts(net); - - ret = ctnl_timeout_parse_policy(timeouts, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -431,7 +428,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { struct nlattr *nest_parms; - unsigned int *timeouts = l4proto->get_timeouts(net); int ret; nest_parms = nla_nest_start(skb, @@ -439,7 +435,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); if (ret < 0) goto nla_put_failure; -- cgit v1.2.3 From a0ae2562c6c4b2721d9fddba63b7286c13517d9f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Jun 2018 07:46:51 +0200 Subject: netfilter: conntrack: remove l3proto abstraction This unifies ipv4 and ipv6 protocol trackers and removes the l3proto abstraction. This gets rid of all l3proto indirect calls and the need to do a lookup on the function to call for l3 demux. It increases module size by only a small amount (12kbyte), so this reduces size because nf_conntrack.ko is useless without either nf_conntrack_ipv4 or nf_conntrack_ipv6 module. before: text data bss dec hex filename 7357 1088 0 8445 20fd nf_conntrack_ipv4.ko 7405 1084 4 8493 212d nf_conntrack_ipv6.ko 72614 13689 236 86539 1520b nf_conntrack.ko 19K nf_conntrack_ipv4.ko 19K nf_conntrack_ipv6.ko 179K nf_conntrack.ko after: text data bss dec hex filename 79277 13937 236 93450 16d0a nf_conntrack.ko 191K nf_conntrack.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 - include/net/netfilter/nf_conntrack.h | 5 + include/net/netfilter/nf_conntrack_core.h | 1 - include/net/netfilter/nf_conntrack_l3proto.h | 54 -- include/net/netfilter/nf_conntrack_l4proto.h | 4 - net/ipv4/netfilter/Kconfig | 22 +- net/ipv4/netfilter/Makefile | 6 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 368 ----------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 388 ----------- net/ipv6/netfilter/Kconfig | 27 +- net/ipv6/netfilter/Makefile | 6 - net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 355 ----------- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 387 ----------- net/netfilter/Kconfig | 2 + net/netfilter/Makefile | 7 +- net/netfilter/nf_conntrack_core.c | 11 +- net/netfilter/nf_conntrack_proto.c | 847 ++++++++++++++++++------- net/netfilter/nf_conntrack_proto_icmp.c | 388 +++++++++++ net/netfilter/nf_conntrack_proto_icmpv6.c | 387 +++++++++++ net/netfilter/nf_conntrack_standalone.c | 14 +- net/netfilter/nf_nat_core.c | 8 - 21 files changed, 1420 insertions(+), 1870 deletions(-) delete mode 100644 include/net/netfilter/nf_conntrack_l3proto.h delete mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c delete mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c delete mode 100644 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c delete mode 100644 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c create mode 100644 net/netfilter/nf_conntrack_proto_icmp.c create mode 100644 net/netfilter/nf_conntrack_proto_icmpv6.c (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 73f825732326..c84b51682f08 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,9 +10,6 @@ #ifndef _NF_CONNTRACK_IPV4_H #define _NF_CONNTRACK_IPV4_H - -const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; - extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 062dc19b5840..a2b0ed025908 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -41,6 +41,11 @@ union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; +struct nf_conntrack_net { + unsigned int users4; + unsigned int users6; +}; + #include #include diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 35461b2d3462..2a3e0974a6af 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -14,7 +14,6 @@ #define _NF_CONNTRACK_CORE_H #include -#include #include #include diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h deleted file mode 100644 index 5f160375c93a..000000000000 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C)2003,2004 USAGI/WIDE Project - * - * Header for use in defining a given L3 protocol for connection tracking. - * - * Author: - * Yasuyuki Kozakai @USAGI - * - * Derived from include/netfilter_ipv4/ip_conntrack_protocol.h - */ - -#ifndef _NF_CONNTRACK_L3PROTO_H -#define _NF_CONNTRACK_L3PROTO_H -#include -#include -#include -#include - -struct nf_conntrack_l3proto { - /* L3 Protocol Family number. ex) PF_INET */ - u_int16_t l3proto; - - /* size of tuple nlattr, fills a hole */ - u16 nla_size; - - /* Called when netns wants to use connection tracking */ - int (*net_ns_get)(struct net *); - void (*net_ns_put)(struct net *); - - /* Module (if any) which this is connected to. */ - struct module *me; -}; - -extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; - -/* Protocol global registration. */ -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto); -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto); - -const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); - -/* Existing built-in protocols */ -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; - -static inline struct nf_conntrack_l3proto * -__nf_ct_l3proto_find(u_int16_t l3proto) -{ - if (unlikely(l3proto >= NFPROTO_NUMPROTO)) - return &nf_conntrack_l3proto_generic; - return rcu_dereference(nf_ct_l3protos[l3proto]); -} - -#endif /*_NF_CONNTRACK_L3PROTO_H*/ diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index c7a0075d96df..6068c6da3eac 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -130,10 +130,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, /* Protocol global registration. */ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); -void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[], - unsigned int num_proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index bbfc356cb1b5..d9504adc47b3 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4 tristate default n -config NF_CONNTRACK_IPV4 - tristate "IPv4 connection tracking support (required for NAT)" - depends on NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV4 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv4 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -112,7 +96,7 @@ config NF_REJECT_IPV4 config NF_NAT_IPV4 tristate "IPv4 NAT" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT help @@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY # NAT + specific targets: nf_conntrack config IP_NF_NAT tristate "iptables NAT support" - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n select NF_NAT select NF_NAT_IPV4 @@ -340,7 +324,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support" depends on IP_NF_MANGLE - depends on NF_CONNTRACK_IPV4 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK select NETFILTER_FAMILY_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 8394c17c269f..367993adf4d3 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -3,12 +3,6 @@ # Makefile for the netfilter modules on top of IPv4. # -# objects for l3 independent conntrack -nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o - -# connection tracking -obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o - nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c deleted file mode 100644 index 9fbf6c7f8ece..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ /dev/null @@ -1,368 +0,0 @@ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2006-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int conntrack4_net_id __read_mostly; -static DEFINE_MUTEX(register_ipv4_hooks); - -struct conntrack4_net { - unsigned int users; -}; - -static unsigned int ipv4_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv4_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -static unsigned int ipv4_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ - enum ip_conntrack_info ctinfo; - struct nf_conn *tmpl; - - tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl && nf_ct_is_template(tmpl)) { - /* when skipping ct, clear templates to avoid fooling - * later targets/matches - */ - skb->_nfct = 0; - nf_ct_put(tmpl); - } - return NF_ACCEPT; - } - - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); -} - -/* Connection tracking may drop packets, but never alters them, so - make it the first hook. */ -static const struct nf_hook_ops ipv4_conntrack_ops[] = { - { - .hook = ipv4_conntrack_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_conntrack_local, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_CONNTRACK, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, - { - .hook = ipv4_helper, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv4_confirm, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM, - }, -}; - -/* Fast function for those who don't want to parse /proc (and I don't - blame them). */ -/* Reversing the socket's dst/src point of view gives us the reply - mapping. */ -static int -getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct nf_conntrack_tuple tuple; - - memset(&tuple, 0, sizeof(tuple)); - - lock_sock(sk); - tuple.src.u3.ip = inet->inet_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.ip = inet->inet_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.src.l3num = PF_INET; - tuple.dst.protonum = sk->sk_protocol; - release_sock(sk); - - /* We only do TCP and SCTP at the moment: is there a better way? */ - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) { - pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); - return -ENOPROTOOPT; - } - - if ((unsigned int) *len < sizeof(struct sockaddr_in)) { - pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", - *len, sizeof(struct sockaddr_in)); - return -EINVAL; - } - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (h) { - struct sockaddr_in sin; - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - sin.sin_family = AF_INET; - sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u.tcp.port; - sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.u3.ip; - memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); - - pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - nf_ct_put(ct); - if (copy_to_user(user, &sin, sizeof(sin)) != 0) - return -EFAULT; - else - return 0; - } - pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", - &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; -} - -static struct nf_sockopt_ops so_getorigdst = { - .pf = PF_INET, - .get_optmin = SO_ORIGINAL_DST, - .get_optmax = SO_ORIGINAL_DST+1, - .get = getorigdst, - .owner = THIS_MODULE, -}; - -static int ipv4_hooks_register(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - int err = 0; - - mutex_lock(®ister_ipv4_hooks); - - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv4_enable(net); - if (err) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(®ister_ipv4_hooks); - return err; -} - -static void ipv4_hooks_unregister(struct net *net) -{ - struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id); - - mutex_lock(®ister_ipv4_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv4_conntrack_ops, - ARRAY_SIZE(ipv4_conntrack_ops)); - mutex_unlock(®ister_ipv4_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = { - .l3proto = PF_INET, - .net_ns_get = ipv4_hooks_register, - .net_ns_put = ipv4_hooks_unregister, - .me = THIS_MODULE, -}; - -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); -MODULE_ALIAS("ip_conntrack"); -MODULE_LICENSE("GPL"); - -static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, - &nf_conntrack_l4proto_icmp, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, -#endif -}; - -static int ipv4_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static void ipv4_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); -} - -static struct pernet_operations ipv4_net_ops = { - .init = ipv4_net_init, - .exit = ipv4_net_exit, - .id = &conntrack4_net_id, - .size = sizeof(struct conntrack4_net), -}; - -static int __init nf_conntrack_l3proto_ipv4_init(void) -{ - int ret = 0; - - need_conntrack(); - - ret = nf_register_sockopt(&so_getorigdst); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv4_net_ops); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register pernet ops\n"); - goto cleanup_sockopt; - } - - ret = nf_ct_l4proto_register(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n"); - goto cleanup_l4proto; - } - - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - cleanup_pernet: - unregister_pernet_subsys(&ipv4_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv4_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4); - nf_ct_l4proto_unregister(builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - unregister_pernet_subsys(&ipv4_net_ops); - nf_unregister_sockopt(&so_getorigdst); -} - -module_init(nf_conntrack_l3proto_ipv4_init); -module_exit(nf_conntrack_l3proto_ipv4_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c deleted file mode 100644 index 036670b38282..000000000000 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ /dev/null @@ -1,388 +0,0 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2006-2010 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const unsigned int nf_ct_icmp_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp; -} - -static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, - struct net *net, struct nf_conntrack_tuple *tuple) -{ - const struct icmphdr *hp; - struct icmphdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - - tuple->dst.u.icmp.type = hp->type; - tuple->src.u.icmp.id = hp->un.echo.id; - tuple->dst.u.icmp.code = hp->code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMP_ECHO] = ICMP_ECHOREPLY + 1, - [ICMP_ECHOREPLY] = ICMP_ECHO + 1, - [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, - [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, - [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, - [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, - [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, - [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 -}; - -static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - if (orig->dst.u.icmp.type >= sizeof(invmap) || - !invmap[orig->dst.u.icmp.type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) -{ - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmp_get_timeouts(nf_ct_net(ct)); - - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 - }; - - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || - !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { - /* Can't create a new ICMP `conn' with this. */ - pr_debug("icmp: can't create new conn with type %u\n", - ct->tuplehash[0].tuple.dst.u.icmp.type); - nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) -{ - struct nf_conntrack_tuple innertuple, origtuple; - const struct nf_conntrack_l4proto *innerproto; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_zone *zone; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); - return -NF_ACCEPT; - } - - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); -} - -/* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmphdr *icmph; - struct icmphdr _ih; - - /* Not enough header? */ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); - if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); - return -NF_ACCEPT; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); - return -NF_ACCEPT; - } - - /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH && - icmph->type != ICMP_SOURCE_QUENCH && - icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB && - icmph->type != ICMP_REDIRECT) - return NF_ACCEPT; - - return icmp_error_message(net, tmpl, skb, hooknum); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include - -static int icmp_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, -}; - -static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; - - return 0; -} - -static unsigned int icmp_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include -#include - -static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net *in = icmp_pernet(net); - - if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { - if (!timeout) - timeout = &in->timeout; - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; - } else if (timeout) { - /* Set default ICMP timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { - [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmp_sysctl_table[] = { - { - .procname = "nf_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmp_sysctl_table, - sizeof(icmp_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmp_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmp_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmp_timeout; - - return icmp_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmp_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmp.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_ICMP, - .pkt_to_tuple = icmp_pkt_to_tuple, - .invert_tuple = icmp_invert_tuple, - .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, - .destroy = NULL, - .me = NULL, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = icmp_tuple_to_nlattr, - .nlattr_tuple_size = icmp_nlattr_tuple_size, - .nlattr_to_tuple = icmp_nlattr_to_tuple, - .nla_policy = icmp_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = icmp_timeout_nlattr_to_obj, - .obj_to_nlattr = icmp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmp_init_net, - .get_net_proto = icmp_get_net_proto, -}; diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 37b14dc9d863..339d0762b027 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -5,26 +5,6 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER -config NF_DEFRAG_IPV6 - tristate - default n - -config NF_CONNTRACK_IPV6 - tristate "IPv6 connection tracking support" - depends on INET && IPV6 && NF_CONNTRACK - default m if NETFILTER_ADVANCED=n - select NF_DEFRAG_IPV6 - ---help--- - Connection tracking keeps a record of what packets have passed - through your machine, in order to figure out how they are related - into connections. - - This is IPv6 support on Layer 3 independent connection tracking. - Layer 3 independent connection tracking is experimental scheme - which generalize ip_conntrack to support other layer 3 protocols. - - To compile it as a module, choose M here. If unsure, say N. - config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -128,7 +108,7 @@ config NF_LOG_IPV6 config NF_NAT_IPV6 tristate "IPv6 NAT" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT help @@ -328,7 +308,7 @@ config IP6_NF_SECURITY config IP6_NF_NAT tristate "ip6tables NAT support" - depends on NF_CONNTRACK_IPV6 + depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT select NF_NAT_IPV6 @@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT endif # IP6_NF_NAT endif # IP6_NF_IPTABLES - endmenu +config NF_DEFRAG_IPV6 + tristate diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 10a5a1c87320..200c0c235565 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o -# objects for l3 independent conntrack -nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o - -# l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o - nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c deleted file mode 100644 index 37ab25645cf2..000000000000 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Copyright (C)2004 USAGI/WIDE Project - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int conntrack6_net_id; -static DEFINE_MUTEX(register_ipv6_hooks); - -struct conntrack6_net { - unsigned int users; -}; - -static unsigned int ipv6_helper(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - const struct nf_conn_help *help; - const struct nf_conntrack_helper *helper; - enum ip_conntrack_info ctinfo; - __be16 frag_off; - int protoff; - u8 nexthdr; - - /* This is where we call the helper: as the packet goes out. */ - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_thresh */ - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - - return helper->help(skb, protoff, ct, ctinfo); -} - -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - int protoff; - __be16 frag_off; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - goto out; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - goto out; - } - - /* adjust seqs for loopback traffic only in outgoing direction */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } - } -out: - /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(skb); -} - -static unsigned int ipv6_conntrack_in(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static unsigned int ipv6_conntrack_local(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); -} - -static const struct nf_hook_ops ipv6_conntrack_ops[] = { - { - .hook = ipv6_conntrack_in, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_conntrack_local, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_LAST, - }, - { - .hook = ipv6_helper, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_CONNTRACK_HELPER, - }, - { - .hook = ipv6_confirm, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP6_PRI_LAST-1, - }, -}; - -static int -ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) -{ - struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; - const struct ipv6_pinfo *inet6 = inet6_sk(sk); - const struct inet_sock *inet = inet_sk(sk); - const struct nf_conntrack_tuple_hash *h; - struct sockaddr_in6 sin6; - struct nf_conn *ct; - __be32 flow_label; - int bound_dev_if; - - lock_sock(sk); - tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; - tuple.src.u.tcp.port = inet->inet_sport; - tuple.dst.u3.in6 = sk->sk_v6_daddr; - tuple.dst.u.tcp.port = inet->inet_dport; - tuple.dst.protonum = sk->sk_protocol; - bound_dev_if = sk->sk_bound_dev_if; - flow_label = inet6->flow_label; - release_sock(sk); - - if (tuple.dst.protonum != IPPROTO_TCP && - tuple.dst.protonum != IPPROTO_SCTP) - return -ENOPROTOOPT; - - if (*len < 0 || (unsigned int) *len < sizeof(sin6)) - return -EINVAL; - - h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); - if (!h) { - pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", - &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), - &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); - return -ENOENT; - } - - ct = nf_ct_tuplehash_to_ctrack(h); - - sin6.sin6_family = AF_INET6; - sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; - memcpy(&sin6.sin6_addr, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, - sizeof(sin6.sin6_addr)); - - nf_ct_put(ct); - sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); - return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; -} - -static int ipv6_hooks_register(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - int err = 0; - - mutex_lock(®ister_ipv6_hooks); - cnet->users++; - if (cnet->users > 1) - goto out_unlock; - - err = nf_defrag_ipv6_enable(net); - if (err < 0) { - cnet->users = 0; - goto out_unlock; - } - - err = nf_register_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - if (err) - cnet->users = 0; - out_unlock: - mutex_unlock(®ister_ipv6_hooks); - return err; -} - -static void ipv6_hooks_unregister(struct net *net) -{ - struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id); - - mutex_lock(®ister_ipv6_hooks); - if (cnet->users && (--cnet->users == 0)) - nf_unregister_net_hooks(net, ipv6_conntrack_ops, - ARRAY_SIZE(ipv6_conntrack_ops)); - mutex_unlock(®ister_ipv6_hooks); -} - -const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = { - .l3proto = PF_INET6, - .net_ns_get = ipv6_hooks_register, - .net_ns_put = ipv6_hooks_unregister, - .me = THIS_MODULE, -}; - -MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); - -static struct nf_sockopt_ops so_getorigdst6 = { - .pf = NFPROTO_IPV6, - .get_optmin = IP6T_SO_ORIGINAL_DST, - .get_optmax = IP6T_SO_ORIGINAL_DST + 1, - .get = ipv6_getorigdst, - .owner = THIS_MODULE, -}; - -static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = { - &nf_conntrack_l4proto_tcp6, - &nf_conntrack_l4proto_udp6, - &nf_conntrack_l4proto_icmpv6, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite6, -#endif -}; - -static int ipv6_net_init(struct net *net) -{ - return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static void ipv6_net_exit(struct net *net) -{ - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); -} - -static struct pernet_operations ipv6_net_ops = { - .init = ipv6_net_init, - .exit = ipv6_net_exit, - .id = &conntrack6_net_id, - .size = sizeof(struct conntrack6_net), -}; - -static int __init nf_conntrack_l3proto_ipv6_init(void) -{ - int ret = 0; - - need_conntrack(); - - ret = nf_register_sockopt(&so_getorigdst6); - if (ret < 0) { - pr_err("Unable to register netfilter socket option\n"); - return ret; - } - - ret = register_pernet_subsys(&ipv6_net_ops); - if (ret < 0) - goto cleanup_sockopt; - - ret = nf_ct_l4proto_register(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - goto cleanup_pernet; - - ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n"); - goto cleanup_l4proto; - } - return ret; -cleanup_l4proto: - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - cleanup_pernet: - unregister_pernet_subsys(&ipv6_net_ops); - cleanup_sockopt: - nf_unregister_sockopt(&so_getorigdst6); - return ret; -} - -static void __exit nf_conntrack_l3proto_ipv6_fini(void) -{ - synchronize_net(); - nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6); - nf_ct_l4proto_unregister(builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - unregister_pernet_subsys(&ipv6_net_ops); - nf_unregister_sockopt(&so_getorigdst6); -} - -module_init(nf_conntrack_l3proto_ipv6_init); -module_exit(nf_conntrack_l3proto_ipv6_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c deleted file mode 100644 index bed07b998a10..000000000000 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Copyright (C)2003,2004 USAGI/WIDE Project - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Author: - * Yasuyuki Kozakai @USAGI - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; - -static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6; -} - -static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct net *net, - struct nf_conntrack_tuple *tuple) -{ - const struct icmp6hdr *hp; - struct icmp6hdr _hdr; - - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hp == NULL) - return false; - tuple->dst.u.icmp.type = hp->icmp6_type; - tuple->src.u.icmp.id = hp->icmp6_identifier; - tuple->dst.u.icmp.code = hp->icmp6_code; - - return true; -} - -/* Add 1; spaces filled with 0. */ -static const u_int8_t invmap[] = { - [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, - [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, - [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, - [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 -}; - -static const u_int8_t noct_valid_new[] = { - [ICMPV6_MGM_QUERY - 130] = 1, - [ICMPV6_MGM_REPORT - 130] = 1, - [ICMPV6_MGM_REDUCTION - 130] = 1, - [NDISC_ROUTER_SOLICITATION - 130] = 1, - [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, - [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, - [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 -}; - -static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) -{ - int type = orig->dst.u.icmp.type - 128; - if (type < 0 || type >= sizeof(invmap) || !invmap[type]) - return false; - - tuple->src.u.icmp.id = orig->src.u.icmp.id; - tuple->dst.u.icmp.type = invmap[type] - 1; - tuple->dst.u.icmp.code = orig->dst.u.icmp.code; - return true; -} - -static unsigned int *icmpv6_get_timeouts(struct net *net) -{ - return &icmpv6_pernet(net)->timeout; -} - -/* Returns verdict for packet, or -1 for invalid. */ -static int icmpv6_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) -{ - unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmpv6_get_timeouts(nf_ct_net(ct)); - - /* Do not immediately delete the connection after the first - successful reply to avoid excessive conntrackd traffic - and also to handle correctly ICMP echo reply duplicates. */ - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMPV6_ECHO_REQUEST - 128] = 1, - [ICMPV6_NI_QUERY - 128] = 1 - }; - int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; - - if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { - /* Can't create a new ICMPv6 `conn' with this. */ - pr_debug("icmpv6: can't create new conn with type %u\n", - type + 128); - nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - -static int -icmpv6_error_message(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int icmp6off) -{ - struct nf_conntrack_tuple intuple, origtuple; - const struct nf_conntrack_tuple_hash *h; - const struct nf_conntrack_l4proto *inproto; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) - + sizeof(struct ipv6hdr) - + sizeof(struct icmp6hdr), - PF_INET6, net, &origtuple)) { - pr_debug("icmpv6_error: Can't get tuple\n"); - return -NF_ACCEPT; - } - - /* rcu_read_lock()ed by nf_hook_thresh */ - inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { - pr_debug("icmpv6_error: Can't invert tuple\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), - &intuple); - if (!h) { - pr_debug("icmpv6_error: no match\n"); - return -NF_ACCEPT; - } else { - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} - -static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); -} - -static int -icmpv6_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct icmp6hdr *icmp6h; - struct icmp6hdr _ih; - int type; - - icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); - if (icmp6h == NULL) { - icmpv6_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } - - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); - return -NF_ACCEPT; - } - - type = icmp6h->icmp6_type - 130; - if (type >= 0 && type < sizeof(noct_valid_new) && - noct_valid_new[type]) { - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - return NF_ACCEPT; - } - - /* is not error message ? */ - if (icmp6h->icmp6_type >= 128) - return NF_ACCEPT; - - return icmpv6_error_message(net, tmpl, skb, dataoff); -} - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - -#include -#include -static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, - const struct nf_conntrack_tuple *t) -{ - if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || - nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || - nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -1; -} - -static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, - [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, -}; - -static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) -{ - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; - - return 0; -} - -static unsigned int icmpv6_nlattr_tuple_size(void) -{ - static unsigned int size __read_mostly; - - if (!size) - size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); - - return size; -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - -#include -#include - -static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - unsigned int *timeout = data; - struct nf_icmp_net *in = icmpv6_pernet(net); - - if (!timeout) - timeout = icmpv6_get_timeouts(net); - if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { - *timeout = - ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; - } else { - /* Set default ICMPv6 timeout. */ - *timeout = in->timeout; - } - return 0; -} - -static int -icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeout = data; - - if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { - [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - -#ifdef CONFIG_SYSCTL -static struct ctl_table icmpv6_sysctl_table[] = { - { - .procname = "nf_conntrack_icmpv6_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_SYSCTL */ - -static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL - pn->ctl_table = kmemdup(icmpv6_sysctl_table, - sizeof(icmpv6_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_table) - return -ENOMEM; - - pn->ctl_table[0].data = &in->timeout; -#endif - return 0; -} - -static int icmpv6_init_net(struct net *net, u_int16_t proto) -{ - struct nf_icmp_net *in = icmpv6_pernet(net); - struct nf_proto_net *pn = &in->pn; - - in->timeout = nf_ct_icmpv6_timeout; - - return icmpv6_kmemdup_sysctl_table(pn, in); -} - -static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) -{ - return &net->ct.nf_ct_proto.icmpv6.pn; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_ICMPV6, - .pkt_to_tuple = icmpv6_pkt_to_tuple, - .invert_tuple = icmpv6_invert_tuple, - .packet = icmpv6_packet, - .new = icmpv6_new, - .error = icmpv6_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = icmpv6_tuple_to_nlattr, - .nlattr_tuple_size = icmpv6_nlattr_tuple_size, - .nlattr_to_tuple = icmpv6_nlattr_to_tuple, - .nla_policy = icmpv6_nla_policy, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - .ctnl_timeout = { - .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, - .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_ICMP_MAX, - .obj_size = sizeof(unsigned int), - .nla_policy = icmpv6_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ - .init_net = icmpv6_init_net, - .get_net_proto = icmpv6_get_net_proto, -}; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3ce657fbca67..9eab519b403a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -49,6 +49,8 @@ config NETFILTER_NETLINK_LOG config NF_CONNTRACK tristate "Netfilter connection tracking support" default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IPV6 != n help Connection tracking keeps a record of what packets have passed through your machine, in order to figure out how they are related diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f132ea850778..53bd1ed1228a 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,7 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o -nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \ + nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \ + nf_conntrack_proto_icmp.o \ + nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o + +nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c069f2faff4c..5123e91b1982 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -291,7 +291,6 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u_int8_t *protonum) { int dataoff = -1; -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4) const struct iphdr *iph; struct iphdr _iph; @@ -314,15 +313,14 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, nhoff, iph->ihl << 2, skb->len); return -1; } -#endif return dataoff; } +#if IS_ENABLED(CONFIG_IPV6) static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 *protonum) { int protoff = -1; -#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) unsigned int extoff = nhoff + sizeof(struct ipv6hdr); __be16 frag_off; u8 nexthdr; @@ -343,9 +341,9 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, } *protonum = nexthdr; -#endif return protoff; } +#endif static int get_l4proto(const struct sk_buff *skb, unsigned int nhoff, u8 pf, u8 *l4num) @@ -353,8 +351,10 @@ static int get_l4proto(const struct sk_buff *skb, switch (pf) { case NFPROTO_IPV4: return ipv4_get_l4proto(skb, nhoff, l4num); +#if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: return ipv6_get_l4proto(skb, nhoff, l4num); +#endif default: *l4num = 0; break; @@ -2197,9 +2197,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) } EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); -module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, - &nf_conntrack_htable_size, 0600); - static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 39df72bb9d56..803607a90102 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -1,14 +1,4 @@ -/* L3/L4 protocol support for nf_conntrack. */ - -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2006 Netfilter Core Team - * (C) 2003,2004 USAGI/WIDE Project - * (C) 2006-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ +// SPDX-License-Identifier: GPL-2.0 #include #include @@ -24,22 +14,39 @@ #include #include -#include #include #include #include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +extern unsigned int nf_conntrack_net_id; + static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_l3protos); static DEFINE_MUTEX(nf_ct_proto_mutex); -struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { - .l3proto = PF_UNSPEC, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); - #ifdef CONFIG_SYSCTL static int nf_ct_register_sysctl(struct net *net, @@ -127,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); -/* this is guaranteed to always return a valid protocol helper, since - * it falls back to generic_protocol */ -const struct nf_conntrack_l3proto * -nf_ct_l3proto_find_get(u_int16_t l3proto) -{ - struct nf_conntrack_l3proto *p; - - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l3proto_generic; - rcu_read_unlock(); - - return p; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); - -int -nf_ct_l3proto_try_module_get(unsigned short l3proto) -{ - const struct nf_conntrack_l3proto *p; - int ret; - -retry: p = nf_ct_l3proto_find_get(l3proto); - if (p == &nf_conntrack_l3proto_generic) { - ret = request_module("nf_conntrack-%d", l3proto); - if (!ret) - goto retry; - - return -EPROTOTYPE; - } - - return 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get); - -void nf_ct_l3proto_module_put(unsigned short l3proto) -{ - struct nf_conntrack_l3proto *p; - - /* rcu_read_lock not necessary since the caller holds a reference, but - * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() - */ - rcu_read_lock(); - p = __nf_ct_l3proto_find(l3proto); - module_put(p->me); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); - -static int nf_ct_netns_do_get(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - int ret; - - might_sleep(); - - ret = nf_ct_l3proto_try_module_get(nfproto); - if (ret < 0) - return ret; - - /* we already have a reference, can't fail */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (!l3proto->net_ns_get) - return 0; - - ret = l3proto->net_ns_get(net); - if (ret < 0) - nf_ct_l3proto_module_put(nfproto); - - return ret; -} - -int nf_ct_netns_get(struct net *net, u8 nfproto) -{ - int err; - - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; - -err2: - nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} -EXPORT_SYMBOL_GPL(nf_ct_netns_get); - -static void nf_ct_netns_do_put(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - - might_sleep(); - - /* same as nf_conntrack_netns_get(), reference assumed */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (WARN_ON(!l3proto)) - return; - - if (l3proto->net_ns_put) - l3proto->net_ns_put(net); - - nf_ct_l3proto_module_put(nfproto); -} - -void nf_ct_netns_put(struct net *net, uint8_t nfproto) -{ - if (nfproto == NFPROTO_INET) { - nf_ct_netns_do_put(net, NFPROTO_IPV4); - nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_do_put(net, nfproto); -} -EXPORT_SYMBOL_GPL(nf_ct_netns_put); - const struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { @@ -279,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) } EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); -static int kill_l3proto(struct nf_conn *i, void *data) -{ - return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto; -} - static int kill_l4proto(struct nf_conn *i, void *data) { const struct nf_conntrack_l4proto *l4proto; @@ -292,49 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) -{ - int ret = 0; - struct nf_conntrack_l3proto *old; - - if (proto->l3proto >= NFPROTO_NUMPROTO) - return -EBUSY; - - mutex_lock(&nf_ct_proto_mutex); - old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex)); - if (old != &nf_conntrack_l3proto_generic) { - ret = -EBUSY; - goto out_unlock; - } - - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); - -out_unlock: - mutex_unlock(&nf_ct_proto_mutex); - return ret; - -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); - -void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto) -{ - BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); - - mutex_lock(&nf_ct_proto_mutex); - BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], - lockdep_is_held(&nf_ct_proto_mutex) - ) != proto); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], - &nf_conntrack_l3proto_generic); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_rcu(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l3proto, (void*)proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); - static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, const struct nf_conntrack_l4proto *l4proto) { @@ -501,8 +329,23 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one); -int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) +static void +nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) +{ + mutex_lock(&nf_ct_proto_mutex); + while (num_proto-- != 0) + __nf_ct_l4proto_unregister_one(l4proto[num_proto]); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_net(); + /* Remove all contrack entries for this protocol */ + nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); +} + +static int +nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], + unsigned int num_proto) { int ret = -EINVAL, ver; unsigned int i; @@ -520,7 +363,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], } return ret; } -EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], @@ -544,20 +386,6 @@ int nf_ct_l4proto_pernet_register(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register); -void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[], - unsigned int num_proto) -{ - mutex_lock(&nf_ct_proto_mutex); - while (num_proto-- != 0) - __nf_ct_l4proto_unregister_one(l4proto[num_proto]); - mutex_unlock(&nf_ct_proto_mutex); - - synchronize_net(); - /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto); -} -EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); - void nf_ct_l4proto_pernet_unregister(struct net *net, const struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) @@ -567,6 +395,563 @@ void nf_ct_l4proto_pernet_unregister(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister); +static unsigned int ipv4_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb), + ct, ctinfo); +} + +static unsigned int ipv4_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv4_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +static unsigned int ipv4_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */ + enum ip_conntrack_info ctinfo; + struct nf_conn *tmpl; + + tmpl = nf_ct_get(skb, &ctinfo); + if (tmpl && nf_ct_is_template(tmpl)) { + /* when skipping ct, clear templates to avoid fooling + * later targets/matches + */ + skb->_nfct = 0; + nf_ct_put(tmpl); + } + return NF_ACCEPT; + } + + return nf_conntrack_in(state->net, PF_INET, state->hook, skb); +} + +/* Connection tracking may drop packets, but never alters them, so + * make it the first hook. + */ +static const struct nf_hook_ops ipv4_conntrack_ops[] = { + { + .hook = ipv4_conntrack_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_conntrack_local, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, + { + .hook = ipv4_helper, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv4_confirm, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +/* Fast function for those who don't want to parse /proc (and I don't + * blame them). + * Reversing the socket's dst/src point of view gives us the reply + * mapping. + */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + + lock_sock(sk); + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + release_sock(sk); + + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int)*len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST + 1, + .get = getorigdst, + .owner = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int +ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 }; + const struct ipv6_pinfo *inet6 = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct sockaddr_in6 sin6; + struct nf_conn *ct; + __be32 flow_label; + int bound_dev_if; + + lock_sock(sk); + tuple.src.u3.in6 = sk->sk_v6_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.in6 = sk->sk_v6_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.dst.protonum = sk->sk_protocol; + bound_dev_if = sk->sk_bound_dev_if; + flow_label = inet6->flow_label; + release_sock(sk); + + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) + return -ENOPROTOOPT; + + if (*len < 0 || (unsigned int)*len < sizeof(sin6)) + return -EINVAL; + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (!h) { + pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", + &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + + sin6.sin6_family = AF_INET6; + sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; + sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK; + memcpy(&sin6.sin6_addr, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6, + sizeof(sin6.sin6_addr)); + + nf_ct_put(ct); + sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if); + return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0; +} + +static struct nf_sockopt_ops so_getorigdst6 = { + .pf = NFPROTO_IPV6, + .get_optmin = IP6T_SO_ORIGINAL_DST, + .get_optmax = IP6T_SO_ORIGINAL_DST + 1, + .get = ipv6_getorigdst, + .owner = THIS_MODULE, +}; + +static unsigned int ipv6_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + int protoff; + __be16 frag_off; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + goto out; + } + + /* adjust seqs for loopback traffic only in outgoing direction */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_is_loopback_packet(skb)) { + if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; + } + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int ipv6_conntrack_in(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_conntrack_local(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); +} + +static unsigned int ipv6_helper(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + __be16 frag_off; + int protoff; + u8 nexthdr; + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + /* rcu_read_lock()ed by nf_hook_thresh */ + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +static const struct nf_hook_ops ipv6_conntrack_ops[] = { + { + .hook = ipv6_conntrack_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_conntrack_local, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_helper, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, + }, + { + .hook = ipv6_helper, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_CONNTRACK_HELPER, + }, + { + .hook = ipv6_confirm, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_LAST - 1, + }, +}; +#endif + +static int nf_ct_netns_do_get(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + int err = 0; + + mutex_lock(&nf_ct_proto_mutex); + + switch (nfproto) { + case NFPROTO_IPV4: + cnet->users4++; + if (cnet->users4 > 1) + goto out_unlock; + err = nf_defrag_ipv4_enable(net); + if (err) { + cnet->users4 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + if (err) + cnet->users4 = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + cnet->users6++; + if (cnet->users6 > 1) + goto out_unlock; + err = nf_defrag_ipv6_enable(net); + if (err < 0) { + cnet->users6 = 0; + goto out_unlock; + } + + err = nf_register_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (err) + cnet->users6 = 0; + break; +#endif + default: + err = -EPROTO; + break; + } + out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return err; +} + +static void nf_ct_netns_do_put(struct net *net, u8 nfproto) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + mutex_lock(&nf_ct_proto_mutex); + switch (nfproto) { + case NFPROTO_IPV4: + if (cnet->users4 && (--cnet->users4 == 0)) + nf_unregister_net_hooks(net, ipv4_conntrack_ops, + ARRAY_SIZE(ipv4_conntrack_ops)); + break; +#if IS_ENABLED(CONFIG_IPV6) + case NFPROTO_IPV6: + if (cnet->users6 && (--cnet->users6 == 0)) + nf_unregister_net_hooks(net, ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + break; +#endif + } + + mutex_unlock(&nf_ct_proto_mutex); +} + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + if (nfproto == NFPROTO_INET) { + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + } else { + err = nf_ct_netns_do_get(net, nfproto); + if (err < 0) + goto err1; + } + return 0; + +err2: + nf_ct_netns_put(net, NFPROTO_IPV4); +err1: + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_netns_get); + +void nf_ct_netns_put(struct net *net, uint8_t nfproto) +{ + if (nfproto == NFPROTO_INET) { + nf_ct_netns_do_put(net, NFPROTO_IPV4); + nf_ct_netns_do_put(net, NFPROTO_IPV6); + } else { + nf_ct_netns_do_put(net, nfproto); + } +} +EXPORT_SYMBOL_GPL(nf_ct_netns_put); + +static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { + &nf_conntrack_l4proto_tcp4, + &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_icmp, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp4, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite4, +#endif +#if IS_ENABLED(CONFIG_IPV6) + &nf_conntrack_l4proto_tcp6, + &nf_conntrack_l4proto_udp6, + &nf_conntrack_l4proto_icmpv6, +#ifdef CONFIG_NF_CT_PROTO_DCCP + &nf_conntrack_l4proto_dccp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + &nf_conntrack_l4proto_sctp6, +#endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + &nf_conntrack_l4proto_udplite6, +#endif +#endif /* CONFIG_IPV6 */ +}; + +int nf_conntrack_proto_init(void) +{ + int ret = 0; + + ret = nf_register_sockopt(&so_getorigdst); + if (ret < 0) + return ret; + +#if IS_ENABLED(CONFIG_IPV6) + ret = nf_register_sockopt(&so_getorigdst6); + if (ret < 0) + goto cleanup_sockopt; +#endif + ret = nf_ct_l4proto_register(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (ret < 0) + goto cleanup_sockopt2; + + return ret; +cleanup_sockopt2: + nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) +cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst6); +#endif + return ret; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + nf_ct_l4proto_unregister(builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + nf_unregister_sockopt(&so_getorigdst); +#if IS_ENABLED(CONFIG_IPV6) + nf_unregister_sockopt(&so_getorigdst6); +#endif + + /* free l3proto protocol tables */ + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) + kfree(nf_ct_protos[i]); +} + int nf_conntrack_proto_pernet_init(struct net *net) { int err; @@ -583,6 +968,14 @@ int nf_conntrack_proto_pernet_init(struct net *net) if (err < 0) return err; + err = nf_ct_l4proto_pernet_register(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); + if (err < 0) { + nf_ct_l4proto_unregister_sysctl(net, pn, + &nf_conntrack_l4proto_generic); + return err; + } + pn->users++; return 0; } @@ -592,25 +985,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net) struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); + nf_ct_l4proto_pernet_unregister(net, builtin_l4proto, + ARRAY_SIZE(builtin_l4proto)); pn->users--; nf_ct_l4proto_unregister_sysctl(net, pn, &nf_conntrack_l4proto_generic); } -int nf_conntrack_proto_init(void) -{ - unsigned int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) - rcu_assign_pointer(nf_ct_l3protos[i], - &nf_conntrack_l3proto_generic); - return 0; -} -void nf_conntrack_proto_fini(void) -{ - unsigned int i; - /* free l3proto protocol tables */ - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - kfree(nf_ct_protos[i]); -} +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +MODULE_ALIAS("ip_conntrack"); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c new file mode 100644 index 000000000000..036670b38282 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -0,0 +1,388 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * (C) 2006-2010 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned int nf_ct_icmp_timeout = 30*HZ; + +static inline struct nf_icmp_net *icmp_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp; +} + +static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct net *net, struct nf_conntrack_tuple *tuple) +{ + const struct icmphdr *hp; + struct icmphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return true; +} + +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 +}; + +static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + if (orig->dst.u.icmp.type >= sizeof(invmap) || + !invmap[orig->dst.u.icmp.type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +static unsigned int *icmp_get_timeouts(struct net *net) +{ + return &icmp_pernet(net)->timeout; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo) +{ + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmp_get_timeouts(nf_ct_net(ct)); + + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 + }; + + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || + !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + pr_debug("icmp: can't create new conn with type %u\n", + ct->tuplehash[0].tuple.dst.u.icmp.type); + nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); + return false; + } + return true; +} + +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ +static int +icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int hooknum) +{ + struct nf_conntrack_tuple innertuple, origtuple; + const struct nf_conntrack_l4proto *innerproto; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_zone *zone; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_zone tmp; + + WARN_ON(skb_nfct(skb)); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + ip_hdrlen(skb) + + sizeof(struct icmphdr), + PF_INET, net, &origtuple)) { + pr_debug("icmp_error_message: failed to get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_thresh */ + innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, zone, &innertuple); + if (!h) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo += IP_CT_IS_REPLY; + + /* Update skb to refer to this connection */ + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + return NF_ACCEPT; +} + +static void icmp_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + if (icmph == NULL) { + icmp_error_log(skb, net, pf, "short packet"); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip_checksum(skb, hooknum, dataoff, 0)) { + icmp_error_log(skb, net, pf, "bad hw icmp checksum"); + return -NF_ACCEPT; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + icmp_error_log(skb, net, pf, "invalid icmp type"); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(net, tmpl, skb, hooknum); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include +#include + +static int icmp_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, +}; + +static int icmp_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE] || + !tb[CTA_PROTO_ICMP_CODE] || + !tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + + return 0; +} + +static unsigned int icmp_nlattr_tuple_size(void) +{ + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); + + return size; +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include +#include + +static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net *in = icmp_pernet(net); + + if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { + if (!timeout) + timeout = &in->timeout; + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; + } else if (timeout) { + /* Set default ICMP timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { + [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table icmp_sysctl_table[] = { + { + .procname = "nf_conntrack_icmp_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmp_sysctl_table, + sizeof(icmp_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmp_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmp_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmp_timeout; + + return icmp_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmp.pn; +} + +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_ICMP, + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, + .destroy = NULL, + .me = NULL, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = icmp_tuple_to_nlattr, + .nlattr_tuple_size = icmp_nlattr_tuple_size, + .nlattr_to_tuple = icmp_nlattr_to_tuple, + .nla_policy = icmp_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmp_timeout_nlattr_to_obj, + .obj_to_nlattr = icmp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmp_init_net, + .get_net_proto = icmp_get_net_proto, +}; diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 000000000000..bed07b998a10 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,387 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; + +static inline struct nf_icmp_net *icmpv6_pernet(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6; +} + +static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct net *net, + struct nf_conntrack_tuple *tuple) +{ + const struct icmp6hdr *hp; + struct icmp6hdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return true; +} + +/* Add 1; spaces filled with 0. */ +static const u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY + 1 +}; + +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT - 130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + +static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +static unsigned int *icmpv6_get_timeouts(struct net *net) +{ + return &icmpv6_pernet(net)->timeout; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo) +{ + unsigned int *timeout = nf_ct_timeout_lookup(ct); + + if (!timeout) + timeout = icmpv6_get_timeouts(nf_ct_net(ct)); + + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) +{ + static const u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + return false; + } + return true; +} + +static int +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int icmp6off) +{ + struct nf_conntrack_tuple intuple, origtuple; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_l4proto *inproto; + enum ip_conntrack_info ctinfo; + struct nf_conntrack_zone tmp; + + WARN_ON(skb_nfct(skb)); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct icmp6hdr), + PF_INET6, net, &origtuple)) { + pr_debug("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_thresh */ + inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) { + pr_debug("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), + &intuple); + if (!h) { + pr_debug("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + return NF_ACCEPT; +} + +static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, + u8 pf, const char *msg) +{ + nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); +} + +static int +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + u8 pf, unsigned int hooknum) +{ + const struct icmp6hdr *icmp6h; + struct icmp6hdr _ih; + int type; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + icmpv6_error_log(skb, net, pf, "short packet"); + return -NF_ACCEPT; + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { + icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); + return -NF_ACCEPT; + } + + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + return NF_ACCEPT; + } + + /* is not error message ? */ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(net, tmpl, skb, dataoff); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include +#include +static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + if (nla_put_be16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type) || + nla_put_u8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code)) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, +}; + +static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMPV6_TYPE] || + !tb[CTA_PROTO_ICMPV6_CODE] || + !tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} + +static unsigned int icmpv6_nlattr_tuple_size(void) +{ + static unsigned int size __read_mostly; + + if (!size) + size = nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); + + return size; +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include +#include + +static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], + struct net *net, void *data) +{ + unsigned int *timeout = data; + struct nf_icmp_net *in = icmpv6_pernet(net); + + if (!timeout) + timeout = icmpv6_get_timeouts(net); + if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; + } else { + /* Set default ICMPv6 timeout. */ + *timeout = in->timeout; + } + return 0; +} + +static int +icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + if (nla_put_be32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { + [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table icmpv6_sysctl_table[] = { + { + .procname = "nf_conntrack_icmpv6_timeout", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, + struct nf_icmp_net *in) +{ +#ifdef CONFIG_SYSCTL + pn->ctl_table = kmemdup(icmpv6_sysctl_table, + sizeof(icmpv6_sysctl_table), + GFP_KERNEL); + if (!pn->ctl_table) + return -ENOMEM; + + pn->ctl_table[0].data = &in->timeout; +#endif + return 0; +} + +static int icmpv6_init_net(struct net *net, u_int16_t proto) +{ + struct nf_icmp_net *in = icmpv6_pernet(net); + struct nf_proto_net *pn = &in->pn; + + in->timeout = nf_ct_icmpv6_timeout; + + return icmpv6_kmemdup_sysctl_table(pn, in); +} + +static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.icmpv6.pn; +} + +const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_ICMPV6, + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .packet = icmpv6_packet, + .new = icmpv6_new, + .error = icmpv6_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = icmpv6_tuple_to_nlattr, + .nlattr_tuple_size = icmpv6_nlattr_tuple_size, + .nlattr_to_tuple = icmpv6_nlattr_to_tuple, + .nla_policy = icmpv6_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, + .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmpv6_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + .init_net = icmpv6_init_net, + .get_net_proto = icmpv6_get_net_proto, +}; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 47b80fd0d2c3..13279f683da9 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1,12 +1,4 @@ -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - * (C) 2005-2012 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - +// SPDX-License-Identifier: GPL-2.0 #include #include #include @@ -32,7 +24,7 @@ #include #include -MODULE_LICENSE("GPL"); +unsigned int nf_conntrack_net_id __read_mostly; #ifdef CONFIG_NF_CONNTRACK_PROCFS void @@ -674,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, + .id = &nf_conntrack_net_id, + .size = sizeof(struct nf_conntrack_net), }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 86df2a1666fd..6366f0c0b8c1 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -743,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister); int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto) { - int err; - - err = nf_ct_l3proto_try_module_get(l3proto->l3proto); - if (err < 0) - return err; - mutex_lock(&nf_nat_proto_mutex); RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP], &nf_nat_l4proto_tcp); @@ -781,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto) synchronize_rcu(); nf_nat_l3proto_clean(l3proto->l3proto); - nf_ct_l3proto_module_put(l3proto->l3proto); } EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); -- cgit v1.2.3 From cb2b36f5a97df76f547fcc4ab444a02522fb6c96 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:40 -0700 Subject: netfilter: nf_conncount: Switch to plain list Original patch is from Florian Westphal. This patch switches from hlist to plain list to store the list of connections with the same filtering key in nf_conncount. With the plain list, we can insert new connections at the tail, so over time the beginning of list holds long-running connections and those are expired, while the newly creates ones are at the end. Later on, we could probably move checked ones to the end of the list, so the next run has higher chance to reclaim stale entries in the front. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 15 ++++-- net/netfilter/nf_conncount.c | 83 ++++++++++++++++++------------ net/netfilter/nft_connlimit.c | 24 ++++----- 3 files changed, 75 insertions(+), 47 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 3a188a0923a3..e4884e0e4f69 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -1,8 +1,15 @@ #ifndef _NF_CONNTRACK_COUNT_H #define _NF_CONNTRACK_COUNT_H +#include + struct nf_conncount_data; +struct nf_conncount_list { + struct list_head head; /* connections with the same filtering key */ + unsigned int count; /* length of list */ +}; + struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, unsigned int keylen); void nf_conncount_destroy(struct net *net, unsigned int family, @@ -14,15 +21,17 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, +unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, bool *addit); -bool nf_conncount_add(struct hlist_head *head, +void nf_conncount_list_init(struct nf_conncount_list *list); + +bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -void nf_conncount_cache_free(struct hlist_head *hhead); +void nf_conncount_cache_free(struct nf_conncount_list *list); #endif diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 81c02185b2e8..81b060adefef 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -44,7 +44,7 @@ /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { - struct hlist_node node; + struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int cpu; @@ -53,7 +53,7 @@ struct nf_conncount_tuple { struct nf_conncount_rb { struct rb_node node; - struct hlist_head hhead; /* connections/hosts in same subnet */ + struct nf_conncount_list list; u32 key[MAX_KEYLEN]; }; @@ -82,12 +82,15 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -bool nf_conncount_add(struct hlist_head *head, +bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; + if (WARN_ON_ONCE(list->count > INT_MAX)) + return false; + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) return false; @@ -95,13 +98,26 @@ bool nf_conncount_add(struct hlist_head *head, conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; - hlist_add_head(&conn->node, head); + list_add_tail(&conn->node, &list->head); + list->count++; return true; } EXPORT_SYMBOL_GPL(nf_conncount_add); +static void conn_free(struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) +{ + if (WARN_ON_ONCE(list->count == 0)) + return; + + list->count--; + list_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); +} + static const struct nf_conntrack_tuple_hash * -find_or_evict(struct net *net, struct nf_conncount_tuple *conn) +find_or_evict(struct net *net, struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -121,30 +137,29 @@ find_or_evict(struct net *net, struct nf_conncount_tuple *conn) */ age = a - b; if (conn->cpu == cpu || age >= 2) { - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } -unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, +unsigned int nf_conncount_lookup(struct net *net, + struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, bool *addit) { const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn; + struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; - struct hlist_node *n; unsigned int length = 0; *addit = tuple ? true : false; /* check the saved connections */ - hlist_for_each_entry_safe(conn, n, head, node) { - found = find_or_evict(net, conn); + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -157,6 +172,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, nf_ct_zone_id(zone, zone->dir)) *addit = false; } + continue; } @@ -176,8 +192,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, * closed already -> ditch it */ nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); continue; } @@ -189,17 +204,23 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, } EXPORT_SYMBOL_GPL(nf_conncount_lookup); +void nf_conncount_list_init(struct nf_conncount_list *list) +{ + INIT_LIST_HEAD(&list->head); + list->count = 1; +} +EXPORT_SYMBOL_GPL(nf_conncount_list_init); + static void nf_conncount_gc_list(struct net *net, - struct nf_conncount_rb *rbconn) + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn; - struct hlist_node *n; + struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; - hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) { - found = find_or_evict(net, conn); + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) collected++; @@ -213,8 +234,7 @@ static void nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + conn_free(list, conn); collected++; continue; } @@ -271,14 +291,14 @@ count_tree(struct net *net, struct rb_root *root, /* same source network -> be counted! */ unsigned int count; - count = nf_conncount_lookup(net, &rbconn->hhead, tuple, + count = nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); tree_nodes_free(root, gc_nodes, gc_count); if (!addit) return count; - if (!nf_conncount_add(&rbconn->hhead, tuple, zone)) + if (!nf_conncount_add(&rbconn->list, tuple, zone)) return 0; /* hotdrop */ return count + 1; @@ -287,8 +307,8 @@ count_tree(struct net *net, struct rb_root *root, if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) continue; - nf_conncount_gc_list(net, rbconn); - if (hlist_empty(&rbconn->hhead)) + nf_conncount_gc_list(net, &rbconn->list); + if (list_empty(&rbconn->list.head)) gc_nodes[gc_count++] = rbconn; } @@ -322,8 +342,8 @@ count_tree(struct net *net, struct rb_root *root, conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * keylen); - INIT_HLIST_HEAD(&rbconn->hhead); - hlist_add_head(&conn->node, &rbconn->hhead); + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); rb_link_node(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); @@ -388,12 +408,11 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family } EXPORT_SYMBOL_GPL(nf_conncount_init); -void nf_conncount_cache_free(struct hlist_head *hhead) +void nf_conncount_cache_free(struct nf_conncount_list *list) { - struct nf_conncount_tuple *conn; - struct hlist_node *n; + struct nf_conncount_tuple *conn, *conn_n; - hlist_for_each_entry_safe(conn, n, hhead, node) + list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); @@ -408,7 +427,7 @@ static void destroy_tree(struct rb_root *r) rb_erase(node, r); - nf_conncount_cache_free(&rbconn->hhead); + nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index a832c59f0a9c..4f0491a36a1d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,10 +14,10 @@ #include struct nft_connlimit { - spinlock_t lock; - struct hlist_head hhead; - u32 limit; - bool invert; + spinlock_t lock; + struct nf_conncount_list list; + u32 limit; + bool invert; }; static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, @@ -46,13 +46,13 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, } spin_lock_bh(&priv->lock); - count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone, + count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, &addit); if (!addit) goto out; - if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) { + if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) { regs->verdict.code = NF_DROP; spin_unlock_bh(&priv->lock); return; @@ -88,7 +88,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, } spin_lock_init(&priv->lock); - INIT_HLIST_HEAD(&priv->hhead); + nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -99,7 +99,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx, struct nft_connlimit *priv) { nf_ct_netns_put(ctx->net, ctx->family); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static int nft_connlimit_do_dump(struct sk_buff *skb, @@ -213,7 +213,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_src = nft_expr_priv(src); spin_lock_init(&priv_dst->lock); - INIT_HLIST_HEAD(&priv_dst->hhead); + nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -225,7 +225,7 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, { struct nft_connlimit *priv = nft_expr_priv(expr); - nf_conncount_cache_free(&priv->hhead); + nf_conncount_cache_free(&priv->list); } static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) @@ -234,9 +234,9 @@ static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) bool addit, ret; spin_lock_bh(&priv->lock); - nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit); + nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit); - ret = hlist_empty(&priv->hhead); + ret = list_empty(&priv->list.head); spin_unlock_bh(&priv->lock); return ret; -- cgit v1.2.3 From 976afca1ceba53df6f4a543014e15d1c7a962571 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:41 -0700 Subject: netfilter: nf_conncount: Early exit in nf_conncount_lookup() and cleanup This patch is originally from Florian Westphal. This patch does the following three tasks. It applies the same early exit technique for nf_conncount_lookup(). Since now we keep the number of connections in 'struct nf_conncount_list', we no longer need to return the count in nf_conncount_lookup(). Moreover, we expose the garbage collection function nf_conncount_gc_list() for nft_connlimit. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 11 +++++---- net/netfilter/nf_conncount.c | 38 +++++++++++++++++------------- net/netfilter/nft_connlimit.c | 9 +++---- 3 files changed, 33 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index e4884e0e4f69..dbec17f674b7 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -21,10 +21,10 @@ unsigned int nf_conncount_count(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); -unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit); +void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit); void nf_conncount_list_init(struct nf_conncount_list *list); @@ -32,6 +32,9 @@ bool nf_conncount_add(struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); +void nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list); + void nf_conncount_cache_free(struct nf_conncount_list *list); #endif diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 81b060adefef..7dfd9d5e6a3e 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -144,26 +144,29 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } -unsigned int nf_conncount_lookup(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, - bool *addit) +void nf_conncount_lookup(struct net *net, + struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone, + bool *addit) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; - unsigned int length = 0; + unsigned int collect = 0; + /* best effort only */ *addit = tuple ? true : false; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { + if (collect > CONNCOUNT_GC_MAX_NODES) + break; + found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - length++; if (!tuple) continue; @@ -171,8 +174,8 @@ unsigned int nf_conncount_lookup(struct net *net, nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) *addit = false; - } - + } else if (PTR_ERR(found) == -ENOENT) + collect++; continue; } @@ -181,9 +184,10 @@ unsigned int nf_conncount_lookup(struct net *net, if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* - * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". + * + * Attempt to avoid a re-add in this case. */ *addit = false; } else if (already_closed(found_ct)) { @@ -193,14 +197,12 @@ unsigned int nf_conncount_lookup(struct net *net, */ nf_ct_put(found_ct); conn_free(list, conn); + collect++; continue; } nf_ct_put(found_ct); - length++; } - - return length; } EXPORT_SYMBOL_GPL(nf_conncount_lookup); @@ -211,8 +213,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -static void nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +void nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -244,6 +246,7 @@ static void nf_conncount_gc_list(struct net *net, return; } } +EXPORT_SYMBOL_GPL(nf_conncount_gc_list); static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], @@ -291,8 +294,9 @@ count_tree(struct net *net, struct rb_root *root, /* same source network -> be counted! */ unsigned int count; - count = nf_conncount_lookup(net, &rbconn->list, tuple, - zone, &addit); + nf_conncount_lookup(net, &rbconn->list, tuple, zone, + &addit); + count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); if (!addit) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 4f0491a36a1d..37c52ae06741 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -46,8 +46,9 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, } spin_lock_bh(&priv->lock); - count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, - &addit); + nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, + &addit); + count = priv->list.count; if (!addit) goto out; @@ -231,10 +232,10 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool addit, ret; + bool ret; spin_lock_bh(&priv->lock); - nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit); + nf_conncount_gc_list(net, &priv->list); ret = list_empty(&priv->list.head); spin_unlock_bh(&priv->lock); -- cgit v1.2.3 From 5c789e131cbb997a528451564ea4613e812fc718 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Mon, 2 Jul 2018 17:33:44 -0700 Subject: netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search This patch is originally from Florian Westphal. This patch does the following 3 main tasks. 1) Add list lock to 'struct nf_conncount_list' so that we can alter the lists containing the individual connections without holding the main tree lock. It would be useful when we only need to add/remove to/from a list without allocate/remove a node in the tree. With this change, we update nft_connlimit accordingly since we longer need to maintain a list lock in nft_connlimit now. 2) Use RCU for the initial tree search to improve tree look up performance. 3) Add a garbage collection worker. This worker is schedule when there are excessive tree node that needed to be recycled. Moreover,the rbnode reclaim logic is moved from search tree to insert tree to avoid race condition. Signed-off-by: Yi-Hung Wei Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 +- net/netfilter/nf_conncount.c | 253 +++++++++++++++++++++-------- net/netfilter/nft_connlimit.c | 17 +- 3 files changed, 196 insertions(+), 91 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index dbec17f674b7..4b2b2baf8ab4 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -5,9 +5,17 @@ struct nf_conncount_data; +enum nf_conncount_list_add { + NF_CONNCOUNT_ADDED, /* list add was ok */ + NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ + NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ +}; + struct nf_conncount_list { + spinlock_t list_lock; struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ + bool dead; }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, @@ -28,11 +36,12 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, void nf_conncount_list_init(struct nf_conncount_list *list); -bool nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +enum nf_conncount_list_add +nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); -void nf_conncount_gc_list(struct net *net, +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list); void nf_conncount_cache_free(struct nf_conncount_list *list); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 3f14806b7271..02ca7df793f5 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -49,12 +49,14 @@ struct nf_conncount_tuple { struct nf_conntrack_zone zone; int cpu; u32 jiffies32; + struct rcu_head rcu_head; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; + struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; @@ -62,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i struct nf_conncount_data { unsigned int keylen; struct rb_root root[CONNCOUNT_SLOTS]; + struct net *net; + struct work_struct gc_work; + unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; + unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; @@ -82,42 +88,70 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen) return memcmp(a, b, klen * sizeof(u32)); } -bool nf_conncount_add(struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +enum nf_conncount_list_add +nf_conncount_add(struct nf_conncount_list *list, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conncount_tuple *conn; if (WARN_ON_ONCE(list->count > INT_MAX)) - return false; + return NF_CONNCOUNT_ERR; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) - return false; + return NF_CONNCOUNT_ERR; + conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; + spin_lock(&list->list_lock); + if (list->dead == true) { + kmem_cache_free(conncount_conn_cachep, conn); + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_SKIP; + } list_add_tail(&conn->node, &list->head); list->count++; - return true; + spin_unlock(&list->list_lock); + return NF_CONNCOUNT_ADDED; } EXPORT_SYMBOL_GPL(nf_conncount_add); -static void conn_free(struct nf_conncount_list *list, +static void __conn_free(struct rcu_head *h) +{ + struct nf_conncount_tuple *conn; + + conn = container_of(h, struct nf_conncount_tuple, rcu_head); + kmem_cache_free(conncount_conn_cachep, conn); +} + +static bool conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { - if (WARN_ON_ONCE(list->count == 0)) - return; + bool free_entry = false; + + spin_lock(&list->list_lock); + + if (list->count == 0) { + spin_unlock(&list->list_lock); + return free_entry; + } list->count--; - list_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + list_del_rcu(&conn->node); + if (list->count == 0) + free_entry = true; + + spin_unlock(&list->list_lock); + call_rcu(&conn->rcu_head, __conn_free); + return free_entry; } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn) + struct nf_conncount_tuple *conn, bool *free_entry) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; @@ -137,7 +171,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, */ age = a - b; if (conn->cpu == cpu || age >= 2) { - conn_free(list, conn); + *free_entry = conn_free(list, conn); return ERR_PTR(-ENOENT); } @@ -154,6 +188,7 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; + bool free_entry = false; /* best effort only */ *addit = tuple ? true : false; @@ -163,7 +198,7 @@ void nf_conncount_lookup(struct net *net, if (collect > CONNCOUNT_GC_MAX_NODES) break; - found = find_or_evict(net, list, conn); + found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { @@ -208,24 +243,31 @@ EXPORT_SYMBOL_GPL(nf_conncount_lookup); void nf_conncount_list_init(struct nf_conncount_list *list) { + spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 1; + list->dead = false; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); -void nf_conncount_gc_list(struct net *net, +/* Return true if the list is empty */ +bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; + bool free_entry = false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn); + found = find_or_evict(net, list, conn, &free_entry); if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) + if (PTR_ERR(found) == -ENOENT) { + if (free_entry) + return true; collected++; + } continue; } @@ -236,18 +278,28 @@ void nf_conncount_gc_list(struct net *net, * closed already -> ditch it */ nf_ct_put(found_ct); - conn_free(list, conn); + if (conn_free(list, conn)) + return true; collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) - return; + return false; } + return false; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); +static void __tree_nodes_free(struct rcu_head *h) +{ + struct nf_conncount_rb *rbconn; + + rbconn = container_of(h, struct nf_conncount_rb, rcu_head); + kmem_cache_free(conncount_rb_cachep, rbconn); +} + static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) @@ -256,23 +308,39 @@ static void tree_nodes_free(struct rb_root *root, while (gc_count) { rbconn = gc_nodes[--gc_count]; - rb_erase(&rbconn->node, root); - kmem_cache_free(conncount_rb_cachep, rbconn); + spin_lock(&rbconn->list.list_lock); + if (rbconn->list.count == 0 && rbconn->list.dead == false) { + rbconn->list.dead = true; + rb_erase(&rbconn->node, root); + call_rcu(&rbconn->rcu_head, __tree_nodes_free); + } + spin_unlock(&rbconn->list.list_lock); } } +static void schedule_gc_worker(struct nf_conncount_data *data, int tree) +{ + set_bit(tree, data->pending_trees); + schedule_work(&data->gc_work); +} + static unsigned int -insert_tree(struct rb_root *root, +insert_tree(struct net *net, + struct nf_conncount_data *data, + struct rb_root *root, unsigned int hash, const u32 *key, u8 keylen, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { + enum nf_conncount_list_add ret; + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; - unsigned int count = 0; + unsigned int count = 0, gc_count = 0; + bool node_found = false; spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); @@ -290,16 +358,44 @@ insert_tree(struct rb_root *root, rbnode = &((*rbnode)->rb_right); } else { /* unlikely: other cpu added node already */ - if (!nf_conncount_add(&rbconn->list, tuple, zone)) { + node_found = true; + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { count = 0; /* hotdrop */ - goto out_unlock; + } else if (ret == NF_CONNCOUNT_ADDED) { + count = rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + node_found = false; } - - count = rbconn->list.count; - goto out_unlock; + break; } + + if (gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + + if (nf_conncount_gc_list(net, &rbconn->list)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + tree_nodes_free(root, gc_nodes, gc_count); + /* tree_node_free before new allocation permits + * allocator to re-use newly free'd object. + * + * This is a rare event; in most cases we will find + * existing node to re-use. (or gc_count is 0). + */ + + if (gc_count >= ARRAY_SIZE(gc_nodes)) + schedule_gc_worker(data, hash); } + if (node_found) + goto out_unlock; + /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -333,87 +429,97 @@ count_tree(struct net *net, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { - struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + enum nf_conncount_list_add ret; struct rb_root *root; - struct rb_node **rbnode, *parent; + struct rb_node *parent; struct nf_conncount_rb *rbconn; - unsigned int gc_count, hash; - bool no_gc = false; - unsigned int count = 0; + unsigned int hash; u8 keylen = data->keylen; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; - spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - restart: - gc_count = 0; - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { + parent = rcu_dereference_raw(root->rb_node); + while (parent) { int diff; bool addit; - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); + rbconn = rb_entry(parent, struct nf_conncount_rb, node); - parent = *rbnode; diff = key_diff(key, rbconn->key, keylen); if (diff < 0) { - rbnode = &((*rbnode)->rb_left); + parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); + parent = rcu_dereference_raw(parent->rb_right); } else { /* same source network -> be counted! */ nf_conncount_lookup(net, &rbconn->list, tuple, zone, &addit); - count = rbconn->list.count; - tree_nodes_free(root, gc_nodes, gc_count); if (!addit) - goto out_unlock; + return rbconn->list.count; + + ret = nf_conncount_add(&rbconn->list, tuple, zone); + if (ret == NF_CONNCOUNT_ERR) { + return 0; /* hotdrop */ + } else if (ret == NF_CONNCOUNT_ADDED) { + return rbconn->list.count; + } else { + /* NF_CONNCOUNT_SKIP, rbconn is already + * reclaimed by gc, insert a new tree node + */ + break; + } + } + } - if (!nf_conncount_add(&rbconn->list, tuple, zone)) - count = 0; /* hotdrop */ - goto out_unlock; + if (!tuple) + return 0; - count++; - goto out_unlock; - } + return insert_tree(net, data, root, hash, key, keylen, tuple, zone); +} - if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes)) - continue; +static void tree_gc_worker(struct work_struct *work) +{ + struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; + struct rb_root *root; + struct rb_node *node; + unsigned int tree, next_tree, gc_count = 0; + + tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; + root = &data->root[tree]; - nf_conncount_gc_list(net, &rbconn->list); - if (list_empty(&rbconn->list.head)) + rcu_read_lock(); + for (node = rb_first(root); node != NULL; node = rb_next(node)) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + if (nf_conncount_gc_list(data->net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } + rcu_read_unlock(); + + spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count) { - no_gc = true; tree_nodes_free(root, gc_nodes, gc_count); - /* tree_node_free before new allocation permits - * allocator to re-use newly free'd object. - * - * This is a rare event; in most cases we will find - * existing node to re-use. (or gc_count is 0). - */ - goto restart; } - count = 0; - if (!tuple) - goto out_unlock; + clear_bit(tree, data->pending_trees); - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - return insert_tree(root, hash, key, keylen, tuple, zone); + next_tree = (tree + 1) % CONNCOUNT_SLOTS; + next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); -out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - return count; + if (next_tree < CONNCOUNT_SLOTS) { + data->gc_tree = next_tree; + schedule_work(work); + } + + spin_unlock_bh(&nf_conncount_locks[tree]); } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. + * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, @@ -452,6 +558,8 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); + data->net = net; + INIT_WORK(&data->gc_work, tree_gc_worker); return data; } @@ -487,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family, { unsigned int i; + cancel_work_sync(&data->gc_work); nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 37c52ae06741..b90d96ba4a12 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -14,7 +14,6 @@ #include struct nft_connlimit { - spinlock_t lock; struct nf_conncount_list list; u32 limit; bool invert; @@ -45,7 +44,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - spin_lock_bh(&priv->lock); nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, &addit); count = priv->list.count; @@ -53,14 +51,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, if (!addit) goto out; - if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) { + if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { regs->verdict.code = NF_DROP; - spin_unlock_bh(&priv->lock); return; } count++; out: - spin_unlock_bh(&priv->lock); if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK; @@ -88,7 +84,6 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx, invert = true; } - spin_lock_init(&priv->lock); nf_conncount_list_init(&priv->list); priv->limit = limit; priv->invert = invert; @@ -213,7 +208,6 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - spin_lock_init(&priv_dst->lock); nf_conncount_list_init(&priv_dst->list); priv_dst->limit = priv_src->limit; priv_dst->invert = priv_src->invert; @@ -232,15 +226,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - spin_lock_bh(&priv->lock); - nf_conncount_gc_list(net, &priv->list); - - ret = list_empty(&priv->list.head); - spin_unlock_bh(&priv->lock); - - return ret; + return nf_conncount_gc_list(net, &priv->list); } static struct nft_expr_type nft_connlimit_type; -- cgit v1.2.3 From ec1b28ca9674def4a158808a6493bdb87b993d81 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:52 +0300 Subject: ipvs: provide just conn to ip_vs_state_name In preparation for followup patches, provide just the cp ptr to ip_vs_state_name. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_conn.c | 8 ++++---- net/netfilter/ipvs/ip_vs_proto.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a0bec23c6d5e..4d76abcf1c41 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1221,7 +1221,7 @@ struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, struct ip_vs_dest *dest, __u32 fwmark); void ip_vs_conn_expire_now(struct ip_vs_conn *cp); -const char *ip_vs_state_name(__u16 proto, int state); +const char *ip_vs_state_name(const struct ip_vs_conn *cp); void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 99e0aa350dc5..de5a64e42ebd 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1107,7 +1107,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); else #endif @@ -1118,7 +1118,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; @@ -1169,7 +1169,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); else @@ -1181,7 +1181,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), dbuf, ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), + ip_vs_state_name(cp), ip_vs_origin_name(cp->flags), (cp->timer.expires-jiffies)/HZ); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index ca880a3ad033..85c446621758 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -193,13 +193,13 @@ ip_vs_create_timeout_table(int *table, int size) } -const char * ip_vs_state_name(__u16 proto, int state) +const char *ip_vs_state_name(const struct ip_vs_conn *cp) { - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); + return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; + return pp->state_name(cp->state); } -- cgit v1.2.3 From 275411430f892407b885be1de2548b2e632892c3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 6 Jul 2018 08:25:53 +0300 Subject: ipvs: add assured state for conn templates cp->state was not used for templates. Add support for state bits and for the first "assured" bit which indicates that some connection controlled by this template was established or assured by the real server. In a followup patch we will use it to drop templates under SYN attack. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 16 ++++++++++++++++ net/netfilter/ipvs/ip_vs_proto.c | 17 +++++++++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 ++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 2 ++ net/netfilter/ipvs/ip_vs_proto_udp.c | 2 ++ net/netfilter/ipvs/ip_vs_sync.c | 18 ++++++------------ 6 files changed, 43 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4d76abcf1c41..a0d2e0bb9a94 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -335,6 +335,11 @@ enum ip_vs_sctp_states { IP_VS_SCTP_S_LAST }; +/* Connection templates use bits from state */ +#define IP_VS_CTPL_S_NONE 0x0000 +#define IP_VS_CTPL_S_ASSURED 0x0001 +#define IP_VS_CTPL_S_LAST 0x0002 + /* Delta sequence info structure * Each ip_vs_conn has 2 (output AND input seq. changes). * Only used in the VS/NAT. @@ -1289,6 +1294,17 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* Mark our template as assured */ +static inline void +ip_vs_control_assure_ct(struct ip_vs_conn *cp) +{ + struct ip_vs_conn *ct = cp->control; + + if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) && + (ct->flags & IP_VS_CONN_F_TEMPLATE)) + ct->state |= IP_VS_CTPL_S_ASSURED; +} + /* IPVS netns init & cleanup functions */ int ip_vs_estimator_net_init(struct netns_ipvs *ipvs); int ip_vs_control_net_init(struct netns_ipvs *ipvs); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 85c446621758..54ee84adf0bd 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -42,6 +42,11 @@ static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; +/* States for conn templates: NONE or words separated with ",", max 15 chars */ +static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = { + [IP_VS_CTPL_S_NONE] = "NONE", + [IP_VS_CTPL_S_ASSURED] = "ASSURED", +}; /* * register an ipvs protocol @@ -195,11 +200,19 @@ ip_vs_create_timeout_table(int *table, int size) const char *ip_vs_state_name(const struct ip_vs_conn *cp) { - struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol); + unsigned int state = cp->state; + struct ip_vs_protocol *pp; + + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (state >= IP_VS_CTPL_S_LAST) + return "ERR!"; + return ip_vs_ctpl_state_name_table[state] ? : "?"; + } + pp = ip_vs_proto_get(cp->protocol); if (pp == NULL || pp->state_name == NULL) return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!"; - return pp->state_name(cp->state); + return pp->state_name(state); } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 3250c4a1111e..b0cd7d08f2a7 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (next_state == IP_VS_SCTP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = next_state]; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 80d10ad12a15..1770fc6ce960 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } + if (new_state == IP_VS_TCP_S_ESTABLISHED) + ip_vs_control_assure_ct(cp); } if (likely(pd)) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e0ef11c3691e..0f53c49025f8 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction, } cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; + if (direction == IP_VS_DIR_OUTPUT) + ip_vs_control_assure_ct(cp); } static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 001501e25625..d4020c5e831d 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer continue; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n", + state); } ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, @@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m goto out; } } else { - /* protocol in templates is not used for state/timeout */ - if (state > 0) { - IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", - state); - state = 0; - } + if (state >= IP_VS_CTPL_S_LAST) + IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n", + state); } if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, pe_data_len, pe_name, pe_name_len)) { -- cgit v1.2.3 From 440534d3c56be04abfb26850ee882d19d223557a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 9 Jul 2018 18:06:33 +0800 Subject: netfilter: Remove useless param helper of nf_ct_helper_ext_add The param helper of nf_ct_helper_ext_add is useless now, then remove it now. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 +--- net/netfilter/nf_conntrack_core.c | 3 +-- net/netfilter/nf_conntrack_helper.c | 5 ++--- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nft_ct.c | 2 +- net/netfilter/xt_CT.c | 2 +- net/openvswitch/conntrack.c | 2 +- 7 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 32c2a94a219d..2492120b8097 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -103,9 +103,7 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *, unsigned int); -struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, - gfp_t gfp); +struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4ced7c7102b6..d97d7e9a9ee7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1401,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; if (exp->helper) { - help = nf_ct_helper_ext_add(ct, exp->helper, - GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, exp->helper); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a55a58c706a9..d557a425289d 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -192,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); struct nf_conn_help * -nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, gfp_t gfp) +nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; @@ -262,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } if (help == NULL) { - help = nf_ct_helper_ext_add(ct, helper, flags); + help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 40152b9ad772..f981bfa8db72 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1947,7 +1947,7 @@ ctnetlink_create_conntrack(struct net *net, } else { struct nf_conn_help *help; - help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 1435ffc5f57e..3bc82ee5464d 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -870,7 +870,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (test_bit(IPS_HELPER_BIT, &ct->status)) return; - help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 03b9a50ec93b..7ba454e9e3fa 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, return -ENOENT; } - help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); if (help == NULL) { nf_conntrack_helper_put(helper); return -ENOMEM; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e05bd3e53f0f..3e33c382367f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1303,7 +1303,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, return -EINVAL; } - help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL); if (!help) { nf_conntrack_helper_put(helper); return -ENOMEM; -- cgit v1.2.3 From f102d66b335a417d4848da9441f585695a838934 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Jul 2018 13:45:14 +0200 Subject: netfilter: nf_tables: use dedicated mutex to guard transactions Continue to use nftnl subsys mutex to protect (un)registration of hook types, expressions and so on, but force batch operations to do their own locking. This allows distinct net namespaces to perform transactions in parallel. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 88 +++++++++++++++++++++++++++++++--------- net/netfilter/nfnetlink.c | 10 ++--- net/netfilter/nft_chain_filter.c | 4 +- net/netfilter/nft_dynset.c | 2 + 5 files changed, 77 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 94767ea3a490..286fd960896f 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,6 +7,7 @@ struct netns_nftables { struct list_head tables; struct list_head commit_list; + struct mutex commit_mutex; unsigned int base_seq; u8 gencursor; u8 validate_state; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 68436edd9cdf..c0fb2bcd30fe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -480,12 +480,19 @@ static void nft_request_module(struct net *net, const char *fmt, ...) if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret)) return; - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); request_module("%s", module_name); - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); } #endif +static void lockdep_nfnl_nft_mutex_not_held(void) +{ +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); +#endif +} + static const struct nft_chain_type * nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, u8 family, bool autoload) @@ -495,6 +502,8 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) return type; + + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (autoload) { nft_request_module(net, "nft-chain-%u-%.*s", family, @@ -802,6 +811,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int err; + lockdep_assert_held(&net->nft.commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { @@ -1042,7 +1052,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) return ERR_PTR(-ENOENT); } -static struct nft_chain *nft_chain_lookup(struct nft_table *table, +static bool lockdep_commit_lock_is_held(struct net *net) +{ +#ifdef CONFIG_PROVE_LOCKING + return lockdep_is_held(&net->nft.commit_mutex); +#else + return true; +#endif +} + +static struct nft_chain *nft_chain_lookup(struct net *net, + struct nft_table *table, const struct nlattr *nla, u8 genmask) { char search[NFT_CHAIN_MAXNAMELEN + 1]; @@ -1055,7 +1075,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table, nla_strlcpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && - !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + !lockdep_commit_lock_is_held(net)); chain = ERR_PTR(-ENOENT); rcu_read_lock(); @@ -1295,7 +1315,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); @@ -1428,6 +1448,9 @@ static int nft_chain_parse_hook(struct net *net, struct net_device *dev; int err; + lockdep_assert_held(&net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], nft_hook_policy, NULL); if (err < 0) @@ -1662,7 +1685,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + chain2 = nft_chain_lookup(ctx->net, table, + nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1724,6 +1748,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; + lockdep_assert_held(&net->nft.commit_mutex); + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); @@ -1742,7 +1768,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } attr = nla[NFTA_CHAIN_HANDLE]; } else { - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); if (IS_ERR(chain)) { if (PTR_ERR(chain) != -ENOENT) { NL_SET_BAD_ATTR(extack, attr); @@ -1820,7 +1846,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { attr = nla[NFTA_CHAIN_NAME]; - chain = nft_chain_lookup(table, attr, genmask); + chain = nft_chain_lookup(net, table, attr, genmask); } if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, attr); @@ -1918,6 +1944,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nft-expr-%u-%.*s", family, @@ -2352,7 +2379,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2386,6 +2413,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, { struct nft_expr *expr; + lockdep_assert_held(&ctx->net->nft.commit_mutex); /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). @@ -2476,6 +2504,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, bool create; u64 handle, pos_handle; + lockdep_assert_held(&net->nft.commit_mutex); + create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); @@ -2484,7 +2514,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2684,7 +2714,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, } if (nla[NFTA_RULE_CHAIN]) { - chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], + genmask); if (IS_ERR(chain)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); @@ -2776,6 +2807,8 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_type *type; u32 flags = 0; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (list_empty(&nf_tables_set_types)) { nft_request_module(ctx->net, "nft-set"); @@ -4820,6 +4853,7 @@ nft_obj_type_get(struct net *net, u32 objtype) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nft-obj-%u", objtype); @@ -5379,6 +5413,7 @@ nft_flowtable_type_get(struct net *net, u8 family) if (type != NULL && try_module_get(type->owner)) return type; + lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { nft_request_module(net, "nf-flowtable-%u", family); @@ -6232,9 +6267,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha next_genbit = nft_gencursor_next(net); g0 = rcu_dereference_protected(chain->rules_gen_0, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); g1 = rcu_dereference_protected(chain->rules_gen_1, - lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + lockdep_commit_lock_is_held(net)); /* No changes to this chain? */ if (chain->rules_next == NULL) { @@ -6442,6 +6477,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + mutex_unlock(&net->nft.commit_mutex); return 0; } @@ -6593,12 +6629,25 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb) { - return __nf_tables_abort(net); + int ret = __nf_tables_abort(net); + + mutex_unlock(&net->nft.commit_mutex); + + return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { - return genid == 0 || net->nft.base_seq == genid; + bool genid_ok; + + mutex_lock(&net->nft.commit_mutex); + + genid_ok = genid == 0 || net->nft.base_seq == genid; + if (!genid_ok) + mutex_unlock(&net->nft.commit_mutex); + + /* else, commit mutex has to be released by commit or abort function */ + return genid_ok; } static const struct nfnetlink_subsystem nf_tables_subsys = { @@ -6937,8 +6986,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], - genmask); + chain = nft_chain_lookup(ctx->net, ctx->table, + tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) @@ -7183,6 +7232,7 @@ static int __net_init nf_tables_init_net(struct net *net) { INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); + mutex_init(&net->nft.commit_mutex); net->nft.base_seq = 1; net->nft.validate_state = NFT_VALIDATE_SKIP; @@ -7191,11 +7241,11 @@ static int __net_init nf_tables_init_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&net->nft.commit_mutex); if (!list_empty(&net->nft.commit_list)) __nf_tables_abort(net); __nft_release_tables(net); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&net->nft.commit_mutex); WARN_ON_ONCE(!list_empty(&net->nft.tables)); } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index dd1d7bc23b03..916913454624 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -350,6 +350,8 @@ replay: return kfree_skb(skb); } + nfnl_unlock(subsys_id); + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -471,13 +473,8 @@ ack: } done: if (status & NFNL_BATCH_REPLAY) { - const struct nfnetlink_subsystem *ss2; - - ss2 = nfnl_dereference_protected(subsys_id); - if (ss2 == ss) - ss->abort(net, oskb); + ss->abort(net, oskb); nfnl_err_reset(&err_list); - nfnl_unlock(subsys_id); kfree_skb(skb); module_put(ss->owner); goto replay; @@ -497,7 +494,6 @@ done: ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); - nfnl_unlock(subsys_id); kfree_skb(skb); module_put(ss->owner); } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index d21834bed805..ea5b7c4944f6 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -322,7 +322,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, if (!ctx.net) return NOTIFY_DONE; - nfnl_lock(NFNL_SUBSYS_NFTABLES); + mutex_lock(&ctx.net->nft.commit_mutex); list_for_each_entry(table, &ctx.net->nft.tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -337,7 +337,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); + mutex_unlock(&ctx.net->nft.commit_mutex); put_net(ctx.net); return NOTIFY_DONE; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 27d7e4598ab6..81184c244d1a 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u64 timeout; int err; + lockdep_assert_held(&ctx->net->nft.commit_mutex); + if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || tb[NFTA_DYNSET_SREG_KEY] == NULL) -- cgit v1.2.3 From 70b095c84326640eeacfd69a411db8fc36e8ab1a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 14 Jul 2018 01:14:01 +0200 Subject: ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module IPV6=m DEFRAG_IPV6=m CONNTRACK=y yields: net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get': net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable' net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6' Setting DEFRAG_IPV6=y causes undefined references to ip6_rhash_params ip6_frag_init and ip6_expire_frag_queue so it would be needed to force IPV6=y too. This patch gets rid of the 'followup linker error' by removing the dependency of ipv6.ko symbols from netfilter ipv6 defrag. Shared code is placed into a header, then used from both. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/ipv6.h | 28 -------- include/net/ipv6_frag.h | 104 ++++++++++++++++++++++++++++++ net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 17 +++-- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 +- net/ipv6/reassembly.c | 92 ++------------------------ net/openvswitch/conntrack.c | 1 + 7 files changed, 126 insertions(+), 121 deletions(-) create mode 100644 include/net/ipv6_frag.h (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index aa6fd11a887c..3720958cd4e1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -581,34 +581,6 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, } #endif -struct inet_frag_queue; - -enum ip6_defrag_users { - IP6_DEFRAG_LOCAL_DELIVER, - IP6_DEFRAG_CONNTRACK_IN, - __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_OUT, - __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_BRIDGE_IN, - __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, -}; - -void ip6_frag_init(struct inet_frag_queue *q, const void *a); -extern const struct rhashtable_params ip6_rhash_params; - -/* - * Equivalent of ipv4 struct ip - */ -struct frag_queue { - struct inet_frag_queue q; - - int iif; - __u16 nhoffset; - u8 ecn; -}; - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); - static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h new file mode 100644 index 000000000000..6ced1e6899b6 --- /dev/null +++ b/include/net/ipv6_frag.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_FRAG_H +#define _IPV6_FRAG_H +#include +#include +#include +#include + +enum ip6_defrag_users { + IP6_DEFRAG_LOCAL_DELIVER, + IP6_DEFRAG_CONNTRACK_IN, + __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_OUT, + __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, +}; + +/* + * Equivalent of ipv4 struct ip + */ +struct frag_queue { + struct inet_frag_queue q; + + int iif; + __u16 nhoffset; + u8 ecn; +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + const struct frag_v6_compare_key *key = a; + + q->key.v6 = *key; + fq->ecn = 0; +} + +static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline int +ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static inline void +ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) +{ + struct net_device *dev = NULL; + struct sk_buff *head; + + rcu_read_lock(); + spin_lock(&fq->q.lock); + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto out; + + inet_frag_kill(&fq->q); + + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out; + + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + + /* Don't send error if the first segment did not arrive. */ + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; + + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + +out: + spin_unlock(&fq->q.lock); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); +} +#endif +#endif diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2cc224106b69..ec7a5da56129 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include "6lowpan_i.h" diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a452d99c9f52..333ee3256964 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,9 +33,8 @@ #include #include -#include +#include -#include #include #include #include @@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } /* Creation primitives. */ @@ -622,16 +621,24 @@ static struct pernet_operations nf_ct_net_ops = { .exit = nf_ct_net_exit, }; +static const struct rhashtable_params nfct_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, + .automatic_shrinking = true, +}; + int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.constructor = ip6_frag_init; + nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; - nf_frags.rhash_params = ip6_rhash_params; + nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e631be25337e..72dd3e202375 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b939b94e7e91..6edd2ac8ae4b 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include static const char ip6_frag_cache_name[] = "ip6-frags"; @@ -72,61 +72,6 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -void ip6_frag_init(struct inet_frag_queue *q, const void *a) -{ - struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct frag_v6_compare_key *key = a; - - q->key.v6 = *key; - fq->ecn = 0; -} -EXPORT_SYMBOL(ip6_frag_init); - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) -{ - struct net_device *dev = NULL; - struct sk_buff *head; - - rcu_read_lock(); - spin_lock(&fq->q.lock); - - if (fq->q.flags & INET_FRAG_COMPLETE) - goto out; - - inet_frag_kill(&fq->q); - - dev = dev_get_by_index_rcu(net, fq->iif); - if (!dev) - goto out; - - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - - /* Don't send error if the first segment did not arrive. */ - head = fq->q.fragments; - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) - goto out; - - /* But use as source device on which LAST ARRIVED - * segment was received. And do not use fq->dev - * pointer directly, device might already disappeared. - */ - head->dev = dev; - skb_get(head); - spin_unlock(&fq->q.lock); - - icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); - goto out_rcu_unlock; - -out: - spin_unlock(&fq->q.lock); -out_rcu_unlock: - rcu_read_unlock(); - inet_frag_put(&fq->q); -} -EXPORT_SYMBOL(ip6_expire_frag_queue); - static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } static struct frag_queue * @@ -696,42 +641,19 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; -static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) -{ - return jhash2(data, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct inet_frag_queue *fq = data; - - return jhash2((const u32 *)&fq->key.v6, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) -{ - const struct frag_v6_compare_key *key = arg->key; - const struct inet_frag_queue *fq = ptr; - - return !!memcmp(&fq->key, key, sizeof(*key)); -} - -const struct rhashtable_params ip6_rhash_params = { +static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), - .hashfn = ip6_key_hashfn, - .obj_hashfn = ip6_obj_hashfn, - .obj_cmpfn = ip6_obj_cmpfn, + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; -EXPORT_SYMBOL(ip6_rhash_params); int __init ipv6_frag_init(void) { int ret; - ip6_frags.constructor = ip6_frag_init; + ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3e33c382367f..86a75105af1a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef CONFIG_NF_NAT_NEEDED #include -- cgit v1.2.3 From 0015b80abccecca82622d9e9d48eb210572a0c3b Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Mon, 16 Jul 2018 21:10:34 -0700 Subject: net: dsa: Remove VLA usage We avoid 2 VLAs by using a pre-allocated field in dsa_switch. We also try to avoid dynamic allocation whenever possible (when using fewer than bits-per-long ports, which is the common case). Link: http://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com Link: http://lkml.kernel.org/r/20180505185145.GB32630@lunn.ch Signed-off-by: Salvatore Mesoraca [kees: tweak commit subject and message slightly] Signed-off-by: Kees Cook Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa2.c | 14 ++++++++++++++ net/dsa/switch.c | 22 ++++++++++------------ 3 files changed, 27 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fdbd6082945d..461e8a7661b7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -259,6 +259,9 @@ struct dsa_switch { /* Number of switch port queues */ unsigned int num_tx_queues; + unsigned long *bitmap; + unsigned long _bitmap; + /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dc5d9af3dc80..a1917025e155 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n) if (!ds) return NULL; + /* We avoid allocating memory outside dsa_switch + * if it is not needed. + */ + if (n <= sizeof(ds->_bitmap) * 8) { + ds->bitmap = &ds->_bitmap; + } else { + ds->bitmap = devm_kcalloc(dev, + BITS_TO_LONGS(n), + sizeof(unsigned long), + GFP_KERNEL); + if (unlikely(!ds->bitmap)) + return NULL; + } + ds->dev = dev; ds->num_ports = n; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b93511726069..142b294d3446 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, { const struct switchdev_obj_port_mdb *mdb = info->mdb; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(group, ds->num_ports); int port; /* Build a mask of Multicast group members */ - bitmap_zero(group, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, group); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_dsa_port(ds, port)) - set_bit(port, group); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_mdb_prepare_bitmap(ds, mdb, group); + return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap); - dsa_switch_mdb_add_bitmap(ds, mdb, group); + dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap); return 0; } @@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds, { const struct switchdev_obj_port_vlan *vlan = info->vlan; struct switchdev_trans *trans = info->trans; - DECLARE_BITMAP(members, ds->num_ports); int port; /* Build a mask of VLAN members */ - bitmap_zero(members, ds->num_ports); + bitmap_zero(ds->bitmap, ds->num_ports); if (ds->index == info->sw_index) - set_bit(info->port, members); + set_bit(info->port, ds->bitmap); for (port = 0; port < ds->num_ports; port++) if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - set_bit(port, members); + set_bit(port, ds->bitmap); if (switchdev_trans_ph_prepare(trans)) - return dsa_switch_vlan_prepare_bitmap(ds, vlan, members); + return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap); - dsa_switch_vlan_add_bitmap(ds, vlan, members); + dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap); return 0; } -- cgit v1.2.3 From 5544adb9707fda5d54494c37940701894c16b9a0 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:17 +0300 Subject: flow_dissector: Dissect tos and ttl from the tunnel info Add dissection of the tos and ttl from the ip tunnel headers fields in case a match is needed on them. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index c64406717eee..2a17f041f7a1 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -207,7 +207,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ - + FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b555fc229e96..08a5184f4b34 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_PORTS)) + FLOW_DISSECTOR_KEY_ENC_PORTS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP)) return; info = skb_tunnel_info(skb); @@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->src = key->tp_src; tp->dst = key->tp_dst; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { + struct flow_dissector_key_ip *ip; + + ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IP, + target_container); + ip->tos = key->tos; + ip->ttl = key->ttl; + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); -- cgit v1.2.3 From bc56b33404599edc412b91933d74b36873e8ea25 Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Thu, 19 Jul 2018 10:50:44 -0700 Subject: xfrm: Remove xfrmi interface ID from flowi In order to remove performance impact of having the extra u32 in every single flowi, this change removes the flowi_xfrm struct, prefering to take the if_id as a method parameter where needed. In the inbound direction, if_id is only needed during the __xfrm_check_policy() function, and the if_id can be determined at that point based on the skb. As such, xfrmi_decode_session() is only called with the skb in __xfrm_check_policy(). In the outbound direction, the only place where if_id is needed is the xfrm_lookup() call in xfrmi_xmit2(). With this change, the if_id is directly passed into the xfrm_lookup_with_ifid() call. All existing callers can still call xfrm_lookup(), which uses a default if_id of 0. This change does not change any behavior of XFRMIs except for improving overall system performance via flowi size reduction. This change has been tested against the Android Kernel Networking Tests: https://android.googlesource.com/kernel/tests/+/master/net/test Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- include/net/dst.h | 14 +++++++ include/net/flow.h | 9 ----- include/net/xfrm.h | 2 +- net/xfrm/xfrm_interface.c | 4 +- net/xfrm/xfrm_policy.c | 98 +++++++++++++++++++++++++++++++---------------- net/xfrm/xfrm_state.c | 3 +- 6 files changed, 83 insertions(+), 47 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index b3219cd8a5a1..7f735e76ca73 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -475,6 +475,14 @@ static inline struct dst_entry *xfrm_lookup(struct net *net, return dst_orig; } +static inline struct dst_entry * +xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags, u32 if_id) +{ + return dst_orig; +} + static inline struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, @@ -494,6 +502,12 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, int flags, + u32 if_id); + struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags); diff --git a/include/net/flow.h b/include/net/flow.h index 187c9bef672f..8ce21793094e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -26,10 +26,6 @@ struct flowi_tunnel { __be64 tun_id; }; -struct flowi_xfrm { - __u32 if_id; -}; - struct flowi_common { int flowic_oif; int flowic_iif; @@ -43,7 +39,6 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; - struct flowi_xfrm xfrm; kuid_t flowic_uid; }; @@ -83,7 +78,6 @@ struct flowi4 { #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid -#define flowi4_xfrm __fl_common.xfrm /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -115,7 +109,6 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; - fl4->flowi4_xfrm.if_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; @@ -145,7 +138,6 @@ struct flowi6 { #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid -#define flowi6_xfrm __fl_common.xfrm struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -193,7 +185,6 @@ struct flowi { #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid -#define flowi_xfrm u.__fl_common.xfrm } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1350e2cf0749..ca820945f30c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1557,7 +1557,7 @@ struct xfrm_state *xfrm_state_find(const xfrm_address_t *daddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family); + unsigned short family, u32 if_id); struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31cb1c7e3881..ccfe18d67e98 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -307,10 +307,8 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!dst) goto tx_err_link_failure; - fl->flowi_xfrm.if_id = xi->p.if_id; - dst_hold(dst); - dst = xfrm_lookup(xi->net, dst, fl, NULL, 0); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5d2f734f4309..2f70fe68b9b0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1068,14 +1068,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done); */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, - u8 type, u16 family, int dir) + u8 type, u16 family, int dir, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || - pol->if_id != fl->flowi_xfrm.if_id || + pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; @@ -1090,7 +1090,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol, static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, - u16 family, u8 dir) + u16 family, u8 dir, + u32 if_id) { int err; struct xfrm_policy *pol, *ret; @@ -1114,7 +1115,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1133,7 +1134,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if ((pol->priority >= priority) && ret) break; - err = xfrm_policy_match(pol, fl, type, family, dir); + err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; @@ -1158,21 +1159,25 @@ fail: return ret; } -static struct xfrm_policy * -xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir) +static struct xfrm_policy *xfrm_policy_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, + dir, if_id); if (pol != NULL) return pol; #endif - return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, + dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, - const struct flowi *fl, u16 family) + const struct flowi *fl, + u16 family, u32 if_id) { struct xfrm_policy *pol; @@ -1191,7 +1196,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v || - pol->if_id != fl->flowi_xfrm.if_id) { + pol->if_id != if_id) { pol = NULL; goto out; } @@ -1405,7 +1410,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, } } - x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, + family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; @@ -1708,7 +1714,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, - XFRM_POLICY_OUT); + XFRM_POLICY_OUT, + pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); @@ -1942,8 +1949,10 @@ free_dst: goto out; } -static struct xfrm_dst * -xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo) +static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, + const struct flowi *fl, + u16 family, u8 dir, + struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; @@ -1952,7 +1961,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; - pols[0] = xfrm_policy_lookup(net, fl, family, dir); + pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2020,14 +2029,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, return ret; } -/* Main function: finds/creates a bundle for given flow. +/* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. + * + * xfrm_lookup uses an if_id of 0 by default, and is provided for + * compatibility */ -struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, - const struct flowi *fl, - const struct sock *sk, int flags) +struct dst_entry *xfrm_lookup_with_ifid(struct net *net, + struct dst_entry *dst_orig, + const struct flowi *fl, + const struct sock *sk, + int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; @@ -2043,7 +2057,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; - pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family); + pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, + if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) @@ -2087,7 +2102,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo); + xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { @@ -2168,6 +2183,19 @@ dropdst: xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } +EXPORT_SYMBOL(xfrm_lookup_with_ifid); + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, + const struct flowi *fl, const struct sock *sk, + int flags) +{ + return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); +} EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). @@ -2257,19 +2285,12 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - const struct xfrm_if_cb *ifcb = xfrm_if_get_cb(); - struct xfrm_if *xi; int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); - if (ifcb) { - xi = ifcb->decode_session(skb); - if (xi) - fl->flowi_xfrm.if_id = xi->p.if_id; - } err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); @@ -2301,6 +2322,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int reverse; struct flowi fl; int xerr_idx = -1; + const struct xfrm_if_cb *ifcb; + struct xfrm_if *xi; + u32 if_id = 0; + + rcu_read_lock(); + ifcb = xfrm_if_get_cb(); + + if (ifcb) { + xi = ifcb->decode_session(skb); + if (xi) + if_id = xi->p.if_id; + } + rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; @@ -2328,7 +2362,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { - pol = xfrm_sk_policy_lookup(sk, dir, &fl, family); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -2336,7 +2370,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) - pol = xfrm_policy_lookup(net, &fl, family, dir); + pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); @@ -2360,7 +2394,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, - XFRM_POLICY_IN); + XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 27c84e63c7ff..bd5cb7ad2447 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -930,7 +930,7 @@ struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, - unsigned short family) + unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); @@ -940,7 +940,6 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; - u32 if_id = fl->flowi_xfrm.if_id; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; -- cgit v1.2.3 From fbdeaed408cf2728c62640c10848ddb1b67e63d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Jul 2018 21:56:53 +0000 Subject: net: create reusable function for getting ownership info of sysfs inodes Make net_ns_get_ownership() reusable by networking code outside of core. This is useful, for example, to allow bridge related sysfs files to be owned by container root. Add a function comment since this is a potentially dangerous function to use given the way that kobject_get_ownership() works by initializing uid and gid before calling .get_ownership(). Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ net/core/net-sysfs.c | 18 ------------------ net/core/net_namespace.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index a71264d75d7f..9b5fdc50519a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ extern struct net init_net; struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net); +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); + void net_ns_barrier(void); #else /* CONFIG_NET_NS */ #include @@ -182,6 +185,13 @@ static inline struct net *copy_net_ns(unsigned long flags, return old_net; } +static inline void net_ns_get_ownership(const struct net *net, + kuid_t *uid, kgid_t *gid) +{ + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; +} + static inline void net_ns_barrier(void) {} #endif /* CONFIG_NET_NS */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ada065fc685e..0a95bcf64cdc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -656,24 +656,6 @@ static const struct attribute_group wireless_group = { #define net_class_groups NULL #endif /* CONFIG_SYSFS */ -static void net_ns_get_ownership(const struct net *net, - kuid_t *uid, kgid_t *gid) -{ - if (net) { - kuid_t ns_root_uid = make_kuid(net->user_ns, 0); - kgid_t ns_root_gid = make_kgid(net->user_ns, 0); - - if (uid_valid(ns_root_uid)) - *uid = ns_root_uid; - - if (gid_valid(ns_root_gid)) - *gid = ns_root_gid; - } else { - *uid = GLOBAL_ROOT_UID; - *gid = GLOBAL_ROOT_GID; - } -} - #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a11e03f920d3..738871af5efa 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -448,6 +449,33 @@ dec_ucounts: return net; } +/** + * net_ns_get_ownership - get sysfs ownership data for @net + * @net: network namespace in question (can be NULL) + * @uid: kernel user ID for sysfs objects + * @gid: kernel group ID for sysfs objects + * + * Returns the uid/gid pair of root in the user namespace associated with the + * given network namespace. + */ +void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) +{ + if (net) { + kuid_t ns_root_uid = make_kuid(net->user_ns, 0); + kgid_t ns_root_gid = make_kgid(net->user_ns, 0); + + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; + } else { + *uid = GLOBAL_ROOT_UID; + *gid = GLOBAL_ROOT_GID; + } +} +EXPORT_SYMBOL_GPL(net_ns_get_ownership); + static void unhash_nsid(struct net *net, struct net *last) { struct net *tmp; -- cgit v1.2.3 From 042f8825569d628517784d558aefe23c212f0fb2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 20 Jul 2018 21:14:38 -0700 Subject: nfp: bring back support for offloading shared blocks Now that we have offload replay infrastructure added by commit 326367427cc0 ("net: sched: call reoffload op on block callback reg") and flows are guaranteed to be removed correctly, we can revert commit 951a8ee6def3 ("nfp: reject binding to shared blocks"). Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 3 --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 3 --- include/net/pkt_cls.h | 5 ----- 3 files changed, 11 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 458f49235d06..994d2b756fe1 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -195,9 +195,6 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev, if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; - if (tcf_block_shared(f->block)) - return -EOPNOTSUPP; - switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 43b9bf12b174..6bc8a97f7e03 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -631,9 +631,6 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; - if (tcf_block_shared(f->block)) - return -EOPNOTSUPP; - switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e4252a176eec..4f405ca8346f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -114,11 +114,6 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, { } -static inline bool tcf_block_shared(struct tcf_block *block) -{ - return false; -} - static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; -- cgit v1.2.3 From f71e0ca4db187af7c44987e9d21e9042c3046070 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:05 +0200 Subject: net: sched: Avoid implicit chain 0 creation Currently, chain 0 is implicitly created during block creation. However that does not align with chain object exposure, creation and destruction api introduced later on. So make the chain 0 behave the same way as any other chain and only create it when it is needed. Since chain 0 is somehow special as the qdiscs need to hold pointer to the first chain tp, this requires to move the chain head change callback infra to the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ++- net/sched/cls_api.c | 86 +++++++++++++++++++++-------------------------- 2 files changed, 43 insertions(+), 48 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7432100027b7..86f4651784e8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -300,7 +300,6 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct list_head filter_chain_list; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ @@ -318,6 +317,10 @@ struct tcf_block { bool keep_dst; unsigned int offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ + struct { + struct tcf_chain *chain; + struct list_head filter_chain_list; + } chain0; }; static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1e8d69790d82..eb0bf9037ef9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -204,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; - INIT_LIST_HEAD(&chain->filter_chain_list); list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; chain->refcnt = 1; + if (!chain->index) + block->chain0.chain = chain; return chain; } @@ -218,12 +219,16 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } -static void tcf_chain_head_change(struct tcf_chain *chain, - struct tcf_proto *tp_head) + +static void tcf_chain0_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; + struct tcf_block *block = chain->block; - list_for_each_entry(item, &chain->filter_chain_list, list) + if (chain->index) + return; + list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); } @@ -231,7 +236,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - tcf_chain_head_change(chain, NULL); + tcf_chain0_head_change(chain, NULL); while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp, NULL); @@ -245,8 +250,10 @@ static void tcf_chain_destroy(struct tcf_chain *chain) struct tcf_block *block = chain->block; list_del(&chain->list); + if (!chain->index) + block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list)) + if (list_empty(&block->chain_list) && block->refcnt == 0) kfree(block); } @@ -346,10 +353,11 @@ no_offload_dev_dec: } static int -tcf_chain_head_change_cb_add(struct tcf_chain *chain, - struct tcf_block_ext_info *ei, - struct netlink_ext_ack *extack) +tcf_chain0_head_change_cb_add(struct tcf_block *block, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -359,23 +367,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain, } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; - if (chain->filter_chain) - tcf_chain_head_change_item(item, chain->filter_chain); - list_add(&item->list, &chain->filter_chain_list); + if (chain0 && chain0->filter_chain) + tcf_chain_head_change_item(item, chain0->filter_chain); + list_add(&item->list, &block->chain0.filter_chain_list); return 0; } static void -tcf_chain_head_change_cb_del(struct tcf_chain *chain, - struct tcf_block_ext_info *ei) +tcf_chain0_head_change_cb_del(struct tcf_block *block, + struct tcf_block_ext_info *ei) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; - list_for_each_entry(item, &chain->filter_chain_list, list) { + list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { - tcf_chain_head_change_item(item, NULL); + if (chain0) + tcf_chain_head_change_item(item, NULL); list_del(&item->list); kfree(item); return; @@ -411,8 +421,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block *block; - struct tcf_chain *chain; - int err; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { @@ -422,14 +430,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); + INIT_LIST_HEAD(&block->chain0.filter_chain_list); - /* Create chain 0 by default, it has to be always present. */ - chain = tcf_chain_create(block, 0); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create new tcf chain"); - err = -ENOMEM; - goto err_chain_create; - } block->refcnt = 1; block->net = net; block->index = block_index; @@ -438,10 +440,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, if (!tcf_block_shared(block)) block->q = q; return block; - -err_chain_create: - kfree(block); - return ERR_PTR(err); } static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) @@ -523,11 +521,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, return block; } -static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) -{ - return list_first_entry(&block->chain_list, struct tcf_chain, list); -} - struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; @@ -621,10 +614,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); - err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), - ei, extack); + err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) - goto err_chain_head_change_cb_add; + goto err_chain0_head_change_cb_add; err = tcf_block_offload_bind(block, q, ei, extack); if (err) @@ -634,15 +626,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, return 0; err_block_offload_bind: - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); -err_chain_head_change_cb_add: + tcf_chain0_head_change_cb_del(block, ei); +err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: if (created) { if (tcf_block_shared(block)) tcf_block_remove(block, net); err_block_insert: - kfree(tcf_block_chain_zero(block)); kfree(block); } else { block->refcnt--; @@ -682,10 +673,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) return; - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); + tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (--block->refcnt == 0) { + if (block->refcnt == 1) { if (tcf_block_shared(block)) tcf_block_remove(block, block->net); @@ -701,13 +692,14 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_unbind(block, q, ei); - if (block->refcnt == 0) { + if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); - /* Finally, put chain 0 and allow block to be freed. */ - tcf_chain_put(tcf_block_chain_zero(block)); + block->refcnt--; + if (list_empty(&block->chain_list)) + kfree(block); } } EXPORT_SYMBOL(tcf_block_put_ext); @@ -947,7 +939,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_proto *tp) { if (*chain_info->pprev == chain->filter_chain) - tcf_chain_head_change(chain, tp); + tcf_chain0_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -960,7 +952,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_proto *next = rtnl_dereference(chain_info->next); if (tp == chain->filter_chain) - tcf_chain_head_change(chain, next); + tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } -- cgit v1.2.3 From 32a4f5ecd7381f30ae3bb36dea77a150ba68af2e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:06 +0200 Subject: net: sched: introduce chain object to uapi Allow user to create, destroy, get and dump chain objects. Do that by extending rtnl commands by the chain-specific ones. User will now be able to explicitly create or destroy chains (so far this was done only automatically according the filter/act needs and refcounting). Also, the user will receive notification about any chain creation or destuction. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 7 + net/sched/cls_api.c | 308 +++++++++++++++++++++++++++++++++++++++-- security/selinux/nlmsgtab.c | 2 +- 4 files changed, 309 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 86f4651784e8..81ec8276db9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -304,6 +304,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + bool explicitly_created; }; struct tcf_block { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7d8502313c99..46399367627f 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -150,6 +150,13 @@ enum { RTM_NEWCACHEREPORT = 96, #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + RTM_NEWCHAIN = 100, +#define RTM_NEWCHAIN RTM_NEWCHAIN + RTM_DELCHAIN, +#define RTM_DELCHAIN RTM_DELCHAIN + RTM_GETCHAIN, +#define RTM_GETCHAIN RTM_GETCHAIN + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index eb0bf9037ef9..e65b390336aa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,29 +262,57 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, + u32 chain_index) { struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - tcf_chain_hold(chain); + if (chain->index == chain_index) return chain; - } + } + return NULL; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast); + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) +{ + struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); + + if (chain) { + tcf_chain_hold(chain); + return chain; } - return create ? tcf_chain_create(block, chain_index) : NULL; + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } EXPORT_SYMBOL(tcf_chain_get); void tcf_chain_put(struct tcf_chain *chain) { - if (--chain->refcnt == 0) + if (--chain->refcnt == 0) { + tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); tcf_chain_destroy(chain); + } } EXPORT_SYMBOL(tcf_chain_put); +static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) +{ + if (chain->explicitly_created) + tcf_chain_put(chain); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -694,8 +722,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); tcf_chain_put(chain); + } block->refcnt--; if (list_empty(&block->chain_list)) @@ -1609,6 +1639,264 @@ out: return skb->len; } +static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, + struct sk_buff *skb, struct tcf_block *block, + u32 portid, u32 seq, u16 flags, int event) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_handle = 0; + if (block->q) { + tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; + tcm->tcm_parent = block->q->handle; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } + + if (nla_put_u32(skb, TCA_CHAIN, chain->index)) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast) +{ + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + struct tcf_block *block = chain->block; + struct net *net = block->net; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_chain_fill_node(chain, net, skb, block, portid, + seq, flags, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +} + +/* Add/delete/get a chain */ + +static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + unsigned long cl; + int err; + + if (n->nlmsg_type != RTM_GETCHAIN && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +replay: + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + parent = t->tcm_parent; + cl = 0; + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) + return PTR_ERR(block); + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + return -EINVAL; + } + chain = tcf_chain_lookup(block, chain_index); + if (n->nlmsg_type == RTM_NEWCHAIN) { + if (chain) { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } + } else { + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + return -EINVAL; + } + tcf_chain_hold(chain); + } + + switch (n->nlmsg_type) { + case RTM_NEWCHAIN: + /* In case the chain was successfully added, take a reference + * to the chain. This ensures that an empty chain + * does not disappear at the end of this function. + */ + tcf_chain_hold(chain); + chain->explicitly_created = true; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + break; + case RTM_DELCHAIN: + /* Flush the chain first as the user requested chain removal. */ + tcf_chain_flush(chain); + /* In case the chain was successfully deleted, put a reference + * to the chain previously taken during addition. + */ + tcf_chain_put_explicitly_created(chain); + break; + case RTM_GETCHAIN: + break; + err = tc_chain_notify(chain, skb, n->nlmsg_seq, + n->nlmsg_seq, n->nlmsg_type, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); + break; + default: + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Unsupported message type"); + goto errout; + } + +errout: + tcf_chain_put(chain); + if (err == -EAGAIN) + /* Replay the request. */ + goto replay; + return err; +} + +/* called with RTNL */ +static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct Qdisc *q = NULL; + struct tcf_block *block; + struct tcf_chain *chain; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + long index_start; + long index; + u32 parent; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; + /* If we work with block index, q is NULL and parent value + * will never be used in the following code. The check + * in tcf_fill_node prevents it. However, compiler does not + * see that far, so set parent to zero to silence the warning + * about parent being uninitialized. + */ + parent = 0; + } else { + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) + goto out; + if (tcf_block_shared(block)) + q = NULL; + } + + index_start = cb->args[0]; + index = 0; + + list_for_each_entry(chain, &block->chain_list, list) { + if ((tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index)) + continue; + if (index < index_start) { + index++; + continue; + } + err = tc_chain_fill_node(chain, net, skb, block, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWCHAIN); + if (err <= 0) + break; + index++; + } + + cb->args[0] = index; + +out: + /* If we did no progress, the error (EMSGSIZE) is real */ + if (skb->len == 0 && err) + return err; + return skb->len; +} + void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT @@ -1825,6 +2113,10 @@ static int __init tc_filter_init(void) rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, tc_dump_tfilter, 0); + rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, + tc_dump_chain, 0); return 0; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 7b7433a1a34c..74b951f55608 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -159,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: /* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 9f407f1768d3e1a5ddd7bd49fa4d1f5a26e10ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:07 +0200 Subject: net: sched: introduce chain templates Allow user to set a template for newly created chains. Template lock down the chain for particular classifier type/options combinations. The classifier needs to support templates, otherwise kernel would reply with error. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 12 +++++++++ net/sched/cls_api.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 81ec8276db9c..085c509c8674 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -238,6 +238,8 @@ struct tcf_result { }; }; +struct tcf_chain; + struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -263,10 +265,18 @@ struct tcf_proto_ops { tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); + void * (*tmplt_create)(struct net *net, + struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*); + int (*tmplt_dump)(struct sk_buff *skb, + struct net *net, + void *tmplt_priv); struct module *owner; }; @@ -305,6 +315,8 @@ struct tcf_chain { u32 index; /* chain index */ unsigned int refcnt; bool explicitly_created; + const struct tcf_proto_ops *tmplt_ops; + void *tmplt_priv; }; struct tcf_block { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e65b390336aa..5f7098b5405e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -298,10 +298,13 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +static void tc_chain_tmplt_del(struct tcf_chain *chain); + void tcf_chain_put(struct tcf_chain *chain) { if (--chain->refcnt == 0) { tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); } } @@ -1258,6 +1261,12 @@ replay: goto errout; } + if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); + err = -EINVAL; + goto errout; + } + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, extack); @@ -1644,8 +1653,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, u32 portid, u32 seq, u16 flags, int event) { unsigned char *b = skb_tail_pointer(skb); + const struct tcf_proto_ops *ops; struct nlmsghdr *nlh; struct tcmsg *tcm; + void *priv; + + ops = chain->tmplt_ops; + priv = chain->tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -1666,6 +1680,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, if (nla_put_u32(skb, TCA_CHAIN, chain->index)) goto nla_put_failure; + if (ops) { + if (nla_put_string(skb, TCA_KIND, ops->kind)) + goto nla_put_failure; + if (ops->tmplt_dump(skb, net, priv) < 0) + goto nla_put_failure; + } + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; @@ -1699,6 +1720,47 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } +static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + void *tmplt_priv; + + /* If kind is not set, user did not specify template. */ + if (!tca[TCA_KIND]) + return 0; + + ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + return -EOPNOTSUPP; + } + + tmplt_priv = ops->tmplt_create(net, chain, tca, extack); + if (IS_ERR(tmplt_priv)) { + module_put(ops->owner); + return PTR_ERR(tmplt_priv); + } + chain->tmplt_ops = ops; + chain->tmplt_priv = tmplt_priv; + return 0; +} + +static void tc_chain_tmplt_del(struct tcf_chain *chain) +{ + const struct tcf_proto_ops *ops = chain->tmplt_ops; + + /* If template ops are set, no work to do for us. */ + if (!ops) + return; + + ops->tmplt_destroy(chain->tmplt_priv); + module_put(ops->owner); +} + /* Add/delete/get a chain */ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, @@ -1763,6 +1825,9 @@ replay: switch (n->nlmsg_type) { case RTM_NEWCHAIN: + err = tc_chain_tmplt_add(chain, net, tca, extack); + if (err) + goto errout; /* In case the chain was successfully added, take a reference * to the chain. This ensures that an empty chain * does not disappear at the end of this function. -- cgit v1.2.3 From 34738452739069947e528123810533f28dd8332b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:11 +0200 Subject: net: sched: cls_flower: propagate chain teplate creation and destruction to drivers Introduce a couple of flower offload commands in order to propagate template creation/destruction events down to device drivers. Drivers may use this information to prepare HW in an optimal way for future filter insertions. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4f405ca8346f..a3101582f642 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -721,6 +721,8 @@ enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, TC_CLSFLOWER_STATS, + TC_CLSFLOWER_TMPLT_CREATE, + TC_CLSFLOWER_TMPLT_DESTROY, }; struct tc_cls_flower_offload { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f0c80758a594..6ccf60364297 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1194,6 +1194,42 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void fl_hw_create_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + struct tcf_exts dummy_exts = { 0, }; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; + cls_flower.cookie = (unsigned long) tmplt; + cls_flower.dissector = &tmplt->dissector; + cls_flower.mask = &tmplt->mask; + cls_flower.key = &tmplt->dummy_key; + cls_flower.exts = &dummy_exts; + + /* We don't care if driver (any of them) fails to handle this + * call. It serves just as a hint for it. + */ + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + +static void fl_hw_destroy_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; + cls_flower.cookie = (unsigned long) tmplt; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack) @@ -1224,6 +1260,8 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, fl_init_dissector(&tmplt->dissector, &tmplt->mask); + fl_hw_create_tmplt(chain, tmplt); + return tmplt; errout_tmplt: @@ -1237,6 +1275,7 @@ static void fl_tmplt_destroy(void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; + fl_hw_destroy_tmplt(tmplt->chain, tmplt); kfree(tmplt); } -- cgit v1.2.3 From 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 27 Jul 2018 09:45:05 +0200 Subject: net: sched: don't dump chains only held by actions In case a chain is empty and not explicitly created by a user, such chain should not exist. The only exception is if there is an action "goto chain" pointing to it. In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in chain dump. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++ include/net/sch_generic.h | 1 + net/sched/act_api.c | 4 +-- net/sched/cls_api.c | 70 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 64 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 148a89ab789b..b43df1e25c6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); + a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; return 0; @@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) static void tcf_action_goto_chain_fini(struct tc_action *a) { - tcf_chain_put(a->goto_chain); + tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 75cce2819de9..e20aad1987b8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } +static void tcf_chain_hold_by_act(struct tcf_chain *chain) +{ + ++chain->action_refcnt; +} + +static void tcf_chain_release_by_act(struct tcf_chain *chain) +{ + --chain->action_refcnt; +} + +static bool tcf_chain_is_zombie(struct tcf_chain *chain) +{ + /* In case all the references are action references, this + * chain is a zombie and should not be listed in the chain + * dump list. + */ + return chain->refcnt == chain->action_refcnt; +} + static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); + + tcf_chain_hold_by_act(chain); + return chain; +} +EXPORT_SYMBOL(tcf_chain_get_by_act); + static void tc_chain_tmplt_del(struct tcf_chain *chain); void tcf_chain_put(struct tcf_chain *chain) @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); +void tcf_chain_put_by_act(struct tcf_chain *chain) +{ + tcf_chain_release_by_act(chain); + tcf_chain_put(chain); +} +EXPORT_SYMBOL(tcf_chain_put_by_act); + static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { if (chain->explicitly_created) @@ -1803,20 +1838,29 @@ replay: chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; - } - if (!(n->nlmsg_flags & NLM_F_CREATE)) { - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; - } - chain = tcf_chain_create(block, chain_index); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + if (tcf_chain_is_zombie(chain)) { + /* The chain exists only because there is + * some action referencing it, meaning it + * is a zombie. + */ + tcf_chain_hold(chain); + } else { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + } else { + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } } } else { - if (!chain) { + if (!chain || tcf_chain_is_zombie(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); return -EINVAL; } @@ -1944,6 +1988,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } + if (tcf_chain_is_zombie(chain)) + continue; err = tc_chain_fill_node(chain, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, -- cgit v1.2.3 From b67c540b8a987e365dc548e5b2ddf023946e3d63 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 27 Jul 2018 15:26:56 +0300 Subject: net: dcb: Add priority-to-DSCP map getters On ingress, a network device such as a switch assigns to packets priority based on various criteria. Common options include interpreting PCP and DSCP fields according to user configuration. When a packet egresses the switch, a reverse process may rewrite PCP and/or DSCP values according to packet priority. The following three functions support a) obtaining a DSCP-to-priority map or vice versa, and b) finding default-priority entries in APP database. The DCB subsystem supports for APP entries a very generous M:N mapping between priorities and protocol identifiers. Understandably, several (say) DSCP values can map to the same priority. But this asymmetry holds the other way around as well--one priority can map to several DSCP values. For this reason, the following functions operate in terms of bitmaps, with ones in positions that match some APP entry. - dcb_ieee_getapp_dscp_prio_mask_map() to compute for a given netdevice a map of DSCP-to-priority-mask, which gives for each DSCP value a bitmap of priorities related to that DSCP value by APP, along the lines of dcb_ieee_getapp_mask(). - dcb_ieee_getapp_prio_dscp_mask_map() similarly to compute for a given netdevice a map from priorities to a bitmap of DSCPs. - dcb_ieee_getapp_default_prio_mask() which finds all default-priority rules for a given port in APP database, and returns a mask of priorities allowed by these default-priority rules. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/dcbnl.h | 13 ++++++++ net/dcb/dcbnl.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) (limited to 'include/net') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 0e5e91be2d30..e22a8a3c089b 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -34,6 +34,19 @@ int dcb_ieee_setapp(struct net_device *, struct dcb_app *); int dcb_ieee_delapp(struct net_device *, struct dcb_app *); u8 dcb_ieee_getapp_mask(struct net_device *, struct dcb_app *); +struct dcb_ieee_app_prio_map { + u64 map[IEEE_8021QAZ_MAX_TCS]; +}; +void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map); + +struct dcb_ieee_app_dscp_map { + u8 map[64]; +}; +void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, + struct dcb_ieee_app_dscp_map *p_map); +u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev); + int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 pid); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 013fdb6fa07a..a556cd708885 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1958,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) } EXPORT_SYMBOL(dcb_ieee_delapp); +/** + * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from + * priorities to the DSCP values assigned to that priority. Initialize p_map + * such that each map element holds a bit mask of DSCP values configured for + * that priority by APP entries. + */ +void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, + struct dcb_ieee_app_prio_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 prio; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) { + prio = itr->app.priority; + p_map->map[prio] |= 1ULL << itr->app.protocol; + } + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); + +/** + * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from + * DSCP values to the priorities assigned to that DSCP value. Initialize p_map + * such that each map element holds a bit mask of priorities configured for a + * given DSCP value by APP entries. + */ +void +dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, + struct dcb_ieee_app_dscp_map *p_map) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + + memset(p_map->map, 0, sizeof(p_map->map)); + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && + itr->app.protocol < 64 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) + p_map->map[itr->app.protocol] |= 1 << itr->app.priority; + } + spin_unlock_bh(&dcb_lock); +} +EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); + +/** + * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet + * type, with valid PID values >= 1536. A special meaning is then assigned to + * protocol value of 0: "default priority. For use when priority is not + * otherwise specified". + * + * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries + * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default + * priorities set by these entries. + */ +u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) +{ + int ifindex = dev->ifindex; + struct dcb_app_type *itr; + u8 mask = 0; + + spin_lock_bh(&dcb_lock); + list_for_each_entry(itr, &dcb_app_list, list) { + if (itr->ifindex == ifindex && + itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && + itr->app.protocol == 0 && + itr->app.priority < IEEE_8021QAZ_MAX_TCS) + mask |= 1 << itr->app.priority; + } + spin_unlock_bh(&dcb_lock); + + return mask; +} +EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); + static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); -- cgit v1.2.3 From 222440b4e832059c0ddf18d1e409f0552ab53a7d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Jul 2018 12:48:04 +0200 Subject: netfilter: nf_tables: handle meta/lookup with direct call Currently nft uses inlined variants for common operations such as 'ip saddr 1.2.3.4' instead of an indirect call. Also handle meta get operations and lookups without indirect call, both are builtin. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 7 +++++++ net/netfilter/nf_tables_core.c | 16 +++++++++++++++- net/netfilter/nft_lookup.c | 6 +++--- net/netfilter/nft_meta.c | 6 +++--- 4 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index a05134507e7b..8da837d2aaf9 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -71,4 +71,11 @@ extern struct nft_set_type nft_set_hash_fast_type; extern struct nft_set_type nft_set_rbtree_type; extern struct nft_set_type nft_set_bitmap_type; +struct nft_expr; +struct nft_regs; +struct nft_pktinfo; +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); +void nft_lookup_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 8de912ca53d3..ffd5c0f9412b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -120,6 +120,20 @@ struct nft_jumpstack { struct nft_rule *const *rules; }; +static void expr_call_ops_eval(const struct nft_expr *expr, + struct nft_regs *regs, + struct nft_pktinfo *pkt) +{ + unsigned long e = (unsigned long)expr->ops->eval; + + if (e == (unsigned long)nft_meta_get_eval) + nft_meta_get_eval(expr, regs, pkt); + else if (e == (unsigned long)nft_lookup_eval) + nft_lookup_eval(expr, regs, pkt); + else + expr->ops->eval(expr, regs, pkt); +} + unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { @@ -153,7 +167,7 @@ next_rule: nft_cmp_fast_eval(expr, ®s); else if (expr->ops != &nft_payload_fast_ops || !nft_payload_fast_eval(expr, ®s, pkt)) - expr->ops->eval(expr, ®s, pkt); + expr_call_ops_eval(expr, ®s, pkt); if (regs.verdict.code != NFT_CONTINUE) break; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index c2a1d84cdfc4..ad13e8643599 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -26,9 +26,9 @@ struct nft_lookup { struct nft_set_binding binding; }; -static void nft_lookup_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_lookup_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 2b94dcc43456..297fe7d97c18 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -41,9 +41,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); #include "../bridge/br_private.h" #endif -static void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; -- cgit v1.2.3 From 6decb5b45e70d6ffff6488cc8e8bad6b9ac7f99b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:32 +0530 Subject: Bluetooth: Define PHY flags in hdev and set 1M as default 1M is mandatory to be supported by LE controllers and the same would be set in power on. This patch defines hdev flags for LE PHYs and set 1M to default. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/hci_core.h | 3 +++ net/bluetooth/hci_core.c | 9 +++++---- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 73e48be5bbb3..664fe1ebf2c7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1514,6 +1514,10 @@ struct hci_cp_le_set_default_phy { __u8 rx_phys; } __packed; +#define HCI_LE_SET_PHY_1M 0x01 +#define HCI_LE_SET_PHY_2M 0x02 +#define HCI_LE_SET_PHY_CODED 0x04 + #define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 struct hci_cp_le_set_ext_scan_params { __u8 own_addr_type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a74453571264..71f79df9ee05 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -315,6 +315,9 @@ struct hci_dev { unsigned long sco_last_tx; unsigned long le_last_tx; + __u8 le_tx_def_phys; + __u8 le_rx_def_phys; + struct workqueue_struct *workqueue; struct workqueue_struct *req_workqueue; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f5c21004186c..432f89f390c0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -830,10 +830,9 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[35] & 0x20) { struct hci_cp_le_set_default_phy cp; - /* No transmitter PHY or receiver PHY preferences */ - cp.all_phys = 0x03; - cp.tx_phys = 0; - cp.rx_phys = 0; + cp.all_phys = 0x00; + cp.tx_phys = hdev->le_tx_def_phys; + cp.rx_phys = hdev->le_rx_def_phys; hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp); } @@ -3027,6 +3026,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; + hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; + hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; -- cgit v1.2.3 From 5075b972f20ddad5bb19542ea4f5794d06673375 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:33 +0530 Subject: Bluetooth: Add defines for BREDR pkt_type and LE PHYs This also add macros for checking LMP support for different pkt_types Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 14 ++++++++++++++ include/net/bluetooth/hci_core.h | 4 ++++ 2 files changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 664fe1ebf2c7..89bf800f6eb1 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -291,6 +291,14 @@ enum { #define HCI_DH3 0x0800 #define HCI_DH5 0x8000 +/* HCI packet types inverted masks */ +#define HCI_2DH1 0x0002 +#define HCI_3DH1 0x0004 +#define HCI_2DH3 0x0100 +#define HCI_3DH3 0x0200 +#define HCI_2DH5 0x1000 +#define HCI_3DH5 0x2000 + #define HCI_HV1 0x0020 #define HCI_HV2 0x0040 #define HCI_HV3 0x0080 @@ -354,6 +362,8 @@ enum { #define LMP_PCONTROL 0x04 #define LMP_TRANSPARENT 0x08 +#define LMP_EDR_2M 0x02 +#define LMP_EDR_3M 0x04 #define LMP_RSSI_INQ 0x40 #define LMP_ESCO 0x80 @@ -361,7 +371,9 @@ enum { #define LMP_EV5 0x02 #define LMP_NO_BREDR 0x20 #define LMP_LE 0x40 +#define LMP_EDR_3SLOT 0x80 +#define LMP_EDR_5SLOT 0x01 #define LMP_SNIFF_SUBR 0x02 #define LMP_PAUSE_ENC 0x04 #define LMP_EDR_ESCO_2M 0x20 @@ -399,6 +411,8 @@ enum { #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 #define HCI_LE_EXT_SCAN_POLICY 0x80 +#define HCI_LE_PHY_2M 0x01 +#define HCI_LE_PHY_CODED 0x08 #define HCI_LE_CHAN_SEL_ALG2 0x40 /* Connection modes */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 71f79df9ee05..a64d13f91d09 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1141,6 +1141,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_inq_tx_pwr_capable(dev) ((dev)->features[0][7] & LMP_INQ_TX_PWR) #define lmp_ext_feat_capable(dev) ((dev)->features[0][7] & LMP_EXTFEATURES) #define lmp_transp_capable(dev) ((dev)->features[0][2] & LMP_TRANSPARENT) +#define lmp_edr_2m_capable(dev) ((dev)->features[0][3] & LMP_EDR_2M) +#define lmp_edr_3m_capable(dev) ((dev)->features[0][3] & LMP_EDR_3M) +#define lmp_edr_3slot_capable(dev) ((dev)->features[0][4] & LMP_EDR_3SLOT) +#define lmp_edr_5slot_capable(dev) ((dev)->features[0][5] & LMP_EDR_5SLOT) /* ----- Extended LMP capabilities ----- */ #define lmp_csb_master_capable(dev) ((dev)->features[2][0] & LMP_CSB_MASTER) -- cgit v1.2.3 From 6244691fec4dd0adebca255e60e0ed7ac8155b2e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:34 +0530 Subject: Bluetooth: Implement Get PHY Configuration mgmt command This commands basically retrieve the supported packet types of BREDR and supported PHYs of the controller. BR_1M_1SLOT, LE_1M_TX and LE_1M_RX would be supported by default. Other PHYs are supported based on the local features. Also this sets PHY_CONFIGURATION bit in supported settings. @ MGMT Command: Get PHY Configuration (0x0044) plen 0 @ MGMT Event: Command Complete (0x0001) plen 15 Get PHY Configuration (0x0044) plen 12 Status: Success (0x00) Supported PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Configurable PHYs: 0x79fe BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 2M TX LE 2M RX LE CODED TX LE CODED RX Selected PHYs: 0x07ff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 25 ++++++++ net/bluetooth/mgmt.c | 145 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index e7303eee65cd..1c93d6e83a6c 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -101,6 +101,7 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_PRIVACY 0x00002000 #define MGMT_SETTING_CONFIGURATION 0x00004000 #define MGMT_SETTING_STATIC_ADDRESS 0x00008000 +#define MGMT_SETTING_PHY_CONFIGURATION 0x00010000 #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 @@ -604,6 +605,30 @@ struct mgmt_cp_set_appearance { } __packed; #define MGMT_SET_APPEARANCE_SIZE 2 +#define MGMT_OP_GET_PHY_CONFIGURATION 0x0044 +struct mgmt_rp_get_phy_confguration { + __le32 supported_phys; + __le32 configurable_phys; + __le32 selected_phys; +} __packed; +#define MGMT_GET_PHY_CONFIGURATION_SIZE 0 + +#define MGMT_PHY_BR_1M_1SLOT 0x00000001 +#define MGMT_PHY_BR_1M_3SLOT 0x00000002 +#define MGMT_PHY_BR_1M_5SLOT 0x00000004 +#define MGMT_PHY_EDR_2M_1SLOT 0x00000008 +#define MGMT_PHY_EDR_2M_3SLOT 0x00000010 +#define MGMT_PHY_EDR_2M_5SLOT 0x00000020 +#define MGMT_PHY_EDR_3M_1SLOT 0x00000040 +#define MGMT_PHY_EDR_3M_3SLOT 0x00000080 +#define MGMT_PHY_EDR_3M_5SLOT 0x00000100 +#define MGMT_PHY_LE_1M_TX 0x00000200 +#define MGMT_PHY_LE_1M_RX 0x00000400 +#define MGMT_PHY_LE_2M_TX 0x00000800 +#define MGMT_PHY_LE_2M_RX 0x00001000 +#define MGMT_PHY_LE_CODED_TX 0x00002000 +#define MGMT_PHY_LE_CODED_RX 0x00004000 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 8a80d48d89c4..c8c3b39fa9f2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -617,6 +617,127 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +static u32 get_supported_phys(struct hci_dev *hdev) +{ + u32 supported_phys = 0; + + if (lmp_bredr_capable(hdev)) { + supported_phys |= MGMT_PHY_BR_1M_1SLOT; + + if (hdev->features[0][0] & LMP_3SLOT) + supported_phys |= MGMT_PHY_BR_1M_3SLOT; + + if (hdev->features[0][0] & LMP_5SLOT) + supported_phys |= MGMT_PHY_BR_1M_5SLOT; + + if (lmp_edr_2m_capable(hdev)) { + supported_phys |= MGMT_PHY_EDR_2M_1SLOT; + + if (lmp_edr_3slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_2M_3SLOT; + + if (lmp_edr_5slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_2M_5SLOT; + + if (lmp_edr_3m_capable(hdev)) { + supported_phys |= MGMT_PHY_EDR_3M_1SLOT; + + if (lmp_edr_3slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_3M_3SLOT; + + if (lmp_edr_5slot_capable(hdev)) + supported_phys |= MGMT_PHY_EDR_3M_5SLOT; + } + } + } + + if (lmp_le_capable(hdev)) { + supported_phys |= MGMT_PHY_LE_1M_TX; + supported_phys |= MGMT_PHY_LE_1M_RX; + + if (hdev->le_features[1] & HCI_LE_PHY_2M) { + supported_phys |= MGMT_PHY_LE_2M_TX; + supported_phys |= MGMT_PHY_LE_2M_RX; + } + + if (hdev->le_features[1] & HCI_LE_PHY_CODED) { + supported_phys |= MGMT_PHY_LE_CODED_TX; + supported_phys |= MGMT_PHY_LE_CODED_RX; + } + } + + return supported_phys; +} + +static u32 get_selected_phys(struct hci_dev *hdev) +{ + u32 selected_phys = 0; + + if (lmp_bredr_capable(hdev)) { + selected_phys |= MGMT_PHY_BR_1M_1SLOT; + + if (hdev->pkt_type & (HCI_DM3 | HCI_DH3)) + selected_phys |= MGMT_PHY_BR_1M_3SLOT; + + if (hdev->pkt_type & (HCI_DM5 | HCI_DH5)) + selected_phys |= MGMT_PHY_BR_1M_5SLOT; + + if (lmp_edr_2m_capable(hdev)) { + if (!(hdev->pkt_type & HCI_2DH1)) + selected_phys |= MGMT_PHY_EDR_2M_1SLOT; + + if (lmp_edr_3slot_capable(hdev) && + !(hdev->pkt_type & HCI_2DH3)) + selected_phys |= MGMT_PHY_EDR_2M_3SLOT; + + if (lmp_edr_5slot_capable(hdev) && + !(hdev->pkt_type & HCI_2DH5)) + selected_phys |= MGMT_PHY_EDR_2M_5SLOT; + + if (lmp_edr_3m_capable(hdev)) { + if (!(hdev->pkt_type & HCI_3DH1)) + selected_phys |= MGMT_PHY_EDR_3M_1SLOT; + + if (lmp_edr_3slot_capable(hdev) && + !(hdev->pkt_type & HCI_3DH3)) + selected_phys |= MGMT_PHY_EDR_3M_3SLOT; + + if (lmp_edr_5slot_capable(hdev) && + !(hdev->pkt_type & HCI_3DH5)) + selected_phys |= MGMT_PHY_EDR_3M_5SLOT; + } + } + } + + if (lmp_le_capable(hdev)) { + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M) + selected_phys |= MGMT_PHY_LE_1M_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M) + selected_phys |= MGMT_PHY_LE_1M_RX; + + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M) + selected_phys |= MGMT_PHY_LE_2M_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M) + selected_phys |= MGMT_PHY_LE_2M_RX; + + if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED) + selected_phys |= MGMT_PHY_LE_CODED_TX; + + if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED) + selected_phys |= MGMT_PHY_LE_CODED_RX; + } + + return selected_phys; +} + +static u32 get_configurable_phys(struct hci_dev *hdev) +{ + return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT & + ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX); +} + static u32 get_supported_settings(struct hci_dev *hdev) { u32 settings = 0; @@ -654,6 +775,8 @@ static u32 get_supported_settings(struct hci_dev *hdev) hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; + settings |= MGMT_SETTING_PHY_CONFIGURATION; + return settings; } @@ -3184,6 +3307,27 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, return err; } +static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_rp_get_phy_confguration rp; + + BT_DBG("sock %p %s", sk, hdev->name); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + + rp.supported_phys = cpu_to_le32(get_supported_phys(hdev)); + rp.selected_phys = cpu_to_le32(get_selected_phys(hdev)); + rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev)); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0, + &rp, sizeof(rp)); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -6544,6 +6688,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, { set_appearance, MGMT_SET_APPEARANCE_SIZE }, + { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From 0314f2867fa0c46d0fc1c23c80e7fab9435079df Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:35 +0530 Subject: Bluetooth: Implement Set PHY Confguration command This enables user to set phys which will be used in all subsequent connections. Also host will use the same in LE scanning as well. @ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 19 +++++ net/bluetooth/hci_event.c | 26 +++++++ net/bluetooth/mgmt.c | 182 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 1c93d6e83a6c..0916e203e5d9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -629,6 +629,25 @@ struct mgmt_rp_get_phy_confguration { #define MGMT_PHY_LE_CODED_TX 0x00002000 #define MGMT_PHY_LE_CODED_RX 0x00004000 +#define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \ + MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \ + MGMT_PHY_EDR_2M_3SLOT | MGMT_PHY_EDR_2M_5SLOT | \ + MGMT_PHY_EDR_3M_1SLOT | MGMT_PHY_EDR_3M_3SLOT | \ + MGMT_PHY_EDR_3M_5SLOT) +#define MGMT_PHY_LE_MASK (MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_1M_RX | \ + MGMT_PHY_LE_2M_TX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_TX | MGMT_PHY_LE_CODED_RX) +#define MGMT_PHY_LE_TX_MASK (MGMT_PHY_LE_1M_TX | MGMT_PHY_LE_2M_TX | \ + MGMT_PHY_LE_CODED_TX) +#define MGMT_PHY_LE_RX_MASK (MGMT_PHY_LE_1M_RX | MGMT_PHY_LE_2M_RX | \ + MGMT_PHY_LE_CODED_RX) + +#define MGMT_OP_SET_PHY_CONFIGURATION 0x0045 +struct mgmt_cp_set_phy_confguration { + __le32 selected_phys; +} __packed; +#define MGMT_SET_PHY_CONFIGURATION_SIZE 4 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 68192152c23b..694231541a4c 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1042,6 +1042,28 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_set_default_phy *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY); + if (!cp) + return; + + hci_dev_lock(hdev); + + hdev->le_tx_def_phys = cp->tx_phys; + hdev->le_rx_def_phys = cp->rx_phys; + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { __u8 *sent, status = *((__u8 *) skb->data); @@ -3163,6 +3185,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_ext_scan_enable(hdev, skb); break; + case HCI_OP_LE_SET_DEFAULT_PHY: + hci_cc_le_set_default_phy(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c8c3b39fa9f2..7cd6a37a63ee 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3328,6 +3328,187 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +static void set_default_phy_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct mgmt_cp_set_phy_confguration *cp; + struct mgmt_pending_cmd *cmd; + + BT_DBG("status 0x%02x", status); + + hci_dev_lock(hdev); + + cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev); + if (!cmd) + goto unlock; + + cp = cmd->param; + + if (status) { + mgmt_cmd_status(cmd->sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + mgmt_status(status)); + } else { + mgmt_cmd_complete(cmd->sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, 0, + NULL, 0); + } + + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); +} + +static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_set_phy_confguration *cp = data; + struct hci_cp_le_set_default_phy cp_phy; + struct mgmt_pending_cmd *cmd; + struct hci_request req; + u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys; + u16 pkt_type = (HCI_DH1 | HCI_DM1); + int err; + + BT_DBG("sock %p %s", sk, hdev->name); + + configurable_phys = get_configurable_phys(hdev); + supported_phys = get_supported_phys(hdev); + selected_phys = __le32_to_cpu(cp->selected_phys); + + if (selected_phys & ~supported_phys) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_INVALID_PARAMS); + + unconfigure_phys = supported_phys & ~configurable_phys; + + if ((selected_phys & unconfigure_phys) != unconfigure_phys) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_INVALID_PARAMS); + + if (selected_phys == get_selected_phys(hdev)) + return mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + 0, NULL, 0); + + hci_dev_lock(hdev); + + if (!hdev_is_powered(hdev)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_REJECTED); + goto unlock; + } + + if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + MGMT_STATUS_BUSY); + goto unlock; + } + + if (selected_phys & MGMT_PHY_BR_1M_3SLOT) + pkt_type |= (HCI_DH3 | HCI_DM3); + else + pkt_type &= ~(HCI_DH3 | HCI_DM3); + + if (selected_phys & MGMT_PHY_BR_1M_5SLOT) + pkt_type |= (HCI_DH5 | HCI_DM5); + else + pkt_type &= ~(HCI_DH5 | HCI_DM5); + + if (selected_phys & MGMT_PHY_EDR_2M_1SLOT) + pkt_type &= ~HCI_2DH1; + else + pkt_type |= HCI_2DH1; + + if (selected_phys & MGMT_PHY_EDR_2M_3SLOT) + pkt_type &= ~HCI_2DH3; + else + pkt_type |= HCI_2DH3; + + if (selected_phys & MGMT_PHY_EDR_2M_5SLOT) + pkt_type &= ~HCI_2DH5; + else + pkt_type |= HCI_2DH5; + + if (selected_phys & MGMT_PHY_EDR_3M_1SLOT) + pkt_type &= ~HCI_3DH1; + else + pkt_type |= HCI_3DH1; + + if (selected_phys & MGMT_PHY_EDR_3M_3SLOT) + pkt_type &= ~HCI_3DH3; + else + pkt_type |= HCI_3DH3; + + if (selected_phys & MGMT_PHY_EDR_3M_5SLOT) + pkt_type &= ~HCI_3DH5; + else + pkt_type |= HCI_3DH5; + + if (pkt_type != hdev->pkt_type) + hdev->pkt_type = pkt_type; + + if ((selected_phys & MGMT_PHY_LE_MASK) == + (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) { + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_PHY_CONFIGURATION, + 0, NULL, 0); + + goto unlock; + } + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + len); + if (!cmd) { + err = -ENOMEM; + goto unlock; + } + + hci_req_init(&req, hdev); + + memset(&cp_phy, 0, sizeof(cp_phy)); + + if (!(selected_phys & MGMT_PHY_LE_TX_MASK)) + cp_phy.all_phys |= 0x01; + + if (!(selected_phys & MGMT_PHY_LE_RX_MASK)) + cp_phy.all_phys |= 0x02; + + if (selected_phys & MGMT_PHY_LE_1M_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_1M; + + if (selected_phys & MGMT_PHY_LE_2M_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_2M; + + if (selected_phys & MGMT_PHY_LE_CODED_TX) + cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED; + + if (selected_phys & MGMT_PHY_LE_1M_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_1M; + + if (selected_phys & MGMT_PHY_LE_2M_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_2M; + + if (selected_phys & MGMT_PHY_LE_CODED_RX) + cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED; + + hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy); + + err = hci_req_run_skb(&req, set_default_phy_complete); + if (err < 0) + mgmt_pending_remove(cmd); + +unlock: + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -6689,6 +6870,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { HCI_MGMT_UNTRUSTED }, { set_appearance, MGMT_SET_APPEARANCE_SIZE }, { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, + { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From b7c23df85b6a1c3bcfb591cfa938d341fc3a556e Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:36 +0530 Subject: Bluetooth: Implement PHY changed event This defines and implement phy changed event and send it to user whenever selected PHYs changes using SET_PHY_CONFIGURATION. This will be also trigerred when BREDR pkt_type is changed using the legacy ioctl HCISETPTYPE. @ MGMT Command: Set PHY Configuration (0x0045) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX < HCI Command: LE Set Default PHY (0x08|0x0031) plen 3 All PHYs preference: 0x00 TX PHYs preference: 0x07 LE 1M LE 2M LE Coded RX PHYs preference: 0x07 LE 1M LE 2M LE Coded > HCI Event: Command Complete (0x0e) plen 4 LE Set Default PHY (0x08|0x0031) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 3 Set PHY Configuration (0x0045) plen 0 Status: Success (0x00) @ MGMT Event: PHY Configuration Changed (0x0026) plen 4 Selected PHYs: 0x7fff BR 1M 1SLOT BR 1M 3SLOT BR 1M 5SLOT EDR 2M 1SLOT EDR 2M 3SLOT EDR 2M 5SLOT EDR 3M 1SLOT EDR 3M 3SLOT EDR 3M 5SLOT LE 1M TX LE 1M RX LE 2M TX LE 2M RX LE CODED TX LE CODED RX Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/mgmt.h | 5 +++++ net/bluetooth/hci_core.c | 4 ++++ net/bluetooth/mgmt.c | 22 +++++++++++++++++++++- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a64d13f91d09..ab5d494a545a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1544,6 +1544,7 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); +int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 0916e203e5d9..7f372e9067c9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -868,3 +868,8 @@ struct mgmt_ev_ext_info_changed { __le16 eir_len; __u8 eir[0]; } __packed; + +#define MGMT_EV_PHY_CONFIGURATION_CHANGED 0x0026 +struct mgmt_ev_phy_configuration_changed { + __le32 selected_phys; +} __packed; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 432f89f390c0..523e91ad64d0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1924,7 +1924,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg) break; case HCISETPTYPE: + if (hdev->pkt_type == (__u16) dr.dev_opt) + break; + hdev->pkt_type = (__u16) dr.dev_opt; + mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7cd6a37a63ee..1867aadc5061 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3328,6 +3328,18 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, &rp, sizeof(rp)); } +int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) +{ + struct mgmt_ev_phy_configuration_changed ev; + + memset(&ev, 0, sizeof(ev)); + + ev.selected_phys = cpu_to_le32(get_selected_phys(hdev)); + + return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev, + sizeof(ev), skip); +} + static void set_default_phy_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -3352,6 +3364,8 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); + + mgmt_phy_configuration_changed(hdev, cmd->sk); } mgmt_pending_remove(cmd); @@ -3369,6 +3383,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, struct hci_request req; u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys; u16 pkt_type = (HCI_DH1 | HCI_DM1); + bool changed = false; int err; BT_DBG("sock %p %s", sk, hdev->name); @@ -3450,11 +3465,16 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, else pkt_type |= HCI_3DH5; - if (pkt_type != hdev->pkt_type) + if (pkt_type != hdev->pkt_type) { hdev->pkt_type = pkt_type; + changed = true; + } if ((selected_phys & MGMT_PHY_LE_MASK) == (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) { + if (changed) + mgmt_phy_configuration_changed(hdev, sk); + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); -- cgit v1.2.3 From 45bdd86eafc7d29e0b4b6681bec9c6ab8eddc6bf Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:37 +0530 Subject: Bluetooth: Set Scan PHYs based on selected PHYs by user Use the PHYs selected in Set Phy Configuration management command while scanning. < HCI Command: LE Set Extended Scan Parameters (0x08|0x0041) plen 13 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x05 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) Entry 1: LE Coded Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Extended Scan Enable (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 +++- include/net/bluetooth/hci_core.h | 9 +++++++++ net/bluetooth/hci_request.c | 37 ++++++++++++++++++++++++++++--------- 3 files changed, 40 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 89bf800f6eb1..04211457367a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1540,7 +1540,9 @@ struct hci_cp_le_set_ext_scan_params { __u8 data[0]; } __packed; -#define LE_SCAN_PHY_1M 0x01 +#define LE_SCAN_PHY_1M 0x01 +#define LE_SCAN_PHY_2M 0x02 +#define LE_SCAN_PHY_CODED 0x04 struct hci_cp_le_scan_phy_params { __u8 type; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ab5d494a545a..113c9bb609c7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1165,6 +1165,15 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define bredr_sc_enabled(dev) (lmp_sc_capable(dev) && \ hci_dev_test_flag(dev, HCI_SC_ENABLED)) +#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) + +#define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) + +#define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ + ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) + /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40)) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index faf7c711234c..215059a7646e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -790,8 +790,8 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, struct hci_cp_le_set_ext_scan_params *ext_param_cp; struct hci_cp_le_set_ext_scan_enable ext_enable_cp; struct hci_cp_le_scan_phy_params *phy_params; - /* Ony single PHY (1M) is supported as of now */ - u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 1]; + u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2]; + u32 plen; ext_param_cp = (void *)data; phy_params = (void *)ext_param_cp->data; @@ -799,16 +799,35 @@ static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval, memset(ext_param_cp, 0, sizeof(*ext_param_cp)); ext_param_cp->own_addr_type = own_addr_type; ext_param_cp->filter_policy = filter_policy; - ext_param_cp->scanning_phys = LE_SCAN_PHY_1M; - memset(phy_params, 0, sizeof(*phy_params)); - phy_params->type = type; - phy_params->interval = cpu_to_le16(interval); - phy_params->window = cpu_to_le16(window); + plen = sizeof(*ext_param_cp); + + if (scan_1m(hdev) || scan_2m(hdev)) { + ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + plen += sizeof(*phy_params); + phy_params++; + } + + if (scan_coded(hdev)) { + ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED; + + memset(phy_params, 0, sizeof(*phy_params)); + phy_params->type = type; + phy_params->interval = cpu_to_le16(interval); + phy_params->window = cpu_to_le16(window); + + plen += sizeof(*phy_params); + phy_params++; + } hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS, - sizeof(*ext_param_cp) + sizeof(*phy_params), - ext_param_cp); + plen, ext_param_cp); memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); ext_enable_cp.enable = LE_SCAN_ENABLE; -- cgit v1.2.3 From b2cc9761f144e8ef714be8c590603073b80ddc13 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:38 +0530 Subject: Bluetooth: Handle extended ADV PDU types This patch defines the extended ADV types and handle it in ADV report. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_event.c | 50 +++++++++++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 04211457367a..83a1593a128e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1976,6 +1976,14 @@ struct hci_ev_le_conn_complete { #define LE_LEGACY_SCAN_RSP_ADV 0x001b #define LE_LEGACY_SCAN_RSP_ADV_SCAN 0x001a +/* Extended Advertising event types */ +#define LE_EXT_ADV_NON_CONN_IND 0x0000 +#define LE_EXT_ADV_CONN_IND 0x0001 +#define LE_EXT_ADV_SCAN_IND 0x0002 +#define LE_EXT_ADV_DIRECT_IND 0x0004 +#define LE_EXT_ADV_SCAN_RSP 0x0008 +#define LE_EXT_ADV_LEGACY_PDU 0x0010 + #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 694231541a4c..5fa00f488cfc 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5137,21 +5137,45 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static u8 convert_legacy_evt_type(u16 evt_type) -{ - switch (evt_type) { - case LE_LEGACY_ADV_IND: +static u8 ext_evt_type_to_legacy(u16 evt_type) +{ + if (evt_type & LE_EXT_ADV_LEGACY_PDU) { + switch (evt_type) { + case LE_LEGACY_ADV_IND: + return LE_ADV_IND; + case LE_LEGACY_ADV_DIRECT_IND: + return LE_ADV_DIRECT_IND; + case LE_LEGACY_ADV_SCAN_IND: + return LE_ADV_SCAN_IND; + case LE_LEGACY_NONCONN_IND: + return LE_ADV_NONCONN_IND; + case LE_LEGACY_SCAN_RSP_ADV: + case LE_LEGACY_SCAN_RSP_ADV_SCAN: + return LE_ADV_SCAN_RSP; + } + + BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", + evt_type); + + return LE_ADV_INVALID; + } + + if (evt_type & LE_EXT_ADV_CONN_IND) { + if (evt_type & LE_EXT_ADV_DIRECT_IND) + return LE_ADV_DIRECT_IND; + return LE_ADV_IND; - case LE_LEGACY_ADV_DIRECT_IND: - return LE_ADV_DIRECT_IND; - case LE_LEGACY_ADV_SCAN_IND: + } + + if (evt_type & LE_EXT_ADV_SCAN_RSP) + return LE_ADV_SCAN_RSP; + + if (evt_type & LE_EXT_ADV_SCAN_IND) return LE_ADV_SCAN_IND; - case LE_LEGACY_NONCONN_IND: + + if (evt_type == LE_EXT_ADV_NON_CONN_IND || + evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; - case LE_LEGACY_SCAN_RSP_ADV: - case LE_LEGACY_SCAN_RSP_ADV_SCAN: - return LE_ADV_SCAN_RSP; - } BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x", evt_type); @@ -5172,7 +5196,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) u16 evt_type; evt_type = __le16_to_cpu(ev->evt_type); - legacy_evt_type = convert_legacy_evt_type(evt_type); + legacy_evt_type = ext_evt_type_to_legacy(evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &ev->bdaddr, ev->bdaddr_type, NULL, 0, ev->rssi, -- cgit v1.2.3 From 6b49bcb4bce2ed0f0aefe8e304a8b9cbaeeaa3f0 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:40 +0530 Subject: Bluetooth: Read no of adv sets during init This patch reads the number of advertising sets in the controller during init and save it in hdev. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 7 +++++++ include/net/bluetooth/hci_core.h | 4 ++++ net/bluetooth/hci_core.c | 16 ++++++++++++++-- net/bluetooth/hci_event.c | 18 ++++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 83a1593a128e..3f93ae9765a4 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,7 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 #define HCI_LE_PHY_CODED 0x08 @@ -1579,6 +1580,12 @@ struct hci_cp_le_ext_conn_param { __le16 max_ce_len; } __packed; +#define HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS 0x203b +struct hci_rp_le_read_num_supported_adv_sets { + __u8 status; + __u8 num_of_sets; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 113c9bb609c7..2aad4a863176 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -222,6 +222,7 @@ struct hci_dev { __u8 le_features[8]; __u8 le_white_list_size; __u8 le_resolv_list_size; + __u8 le_num_of_adv_sets; __u8 le_states[8]; __u8 commands[64]; __u8 hci_ver; @@ -1180,6 +1181,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext create connection if command is supported */ #define use_ext_conn(dev) ((dev)->commands[37] & 0x80) +/* Extended advertising support */ +#define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 523e91ad64d0..7b08b7f57418 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -715,8 +715,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); - if (hdev->commands[25] & 0x40) { - /* Read LE Advertising Channel TX Power */ + /* Read LE Advertising Channel TX Power */ + if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) { + /* HCI TS spec forbids mixing of legacy and extended + * advertising commands wherein READ_ADV_TX_POWER is + * also included. So do not call it if extended adv + * is supported otherwise controller will return + * COMMAND_DISALLOWED for extended commands. + */ hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); } @@ -750,6 +756,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL); } + if (ext_adv_capable(hdev)) { + /* Read LE Number of Supported Advertising Sets */ + hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, + 0, NULL); + } + hci_set_le_support(req); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5fa00f488cfc..0ceb52edc142 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1267,6 +1267,20 @@ static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, le_set_scan_enable_complete(hdev, cp->enable); } +static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status, + rp->num_of_sets); + + if (rp->status) + return; + + hdev->le_num_of_adv_sets = rp->num_of_sets; +} + static void hci_cc_le_read_white_list_size(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3189,6 +3203,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_default_phy(hdev, skb); break; + case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS: + hci_cc_le_read_num_adv_sets(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; -- cgit v1.2.3 From de181e887ac27dadda127c7d4c3e89c6da8fb6d2 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:41 +0530 Subject: Bluetooth: Impmlement extended adv enable This patch basically replaces legacy adv with extended adv based on the controller support. Currently there is no design change. ie only one adv set will be enabled at a time. This also adds tx_power in instance and store whatever returns from Set_ext_parameter, use the same in adv data as well. For instance 0 tx_power is stored in hdev only. < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0010 Use legacy advertising PDUs: ADV_NONCONN_IND Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE 1M (0x01) Secondary max skip: 0x00 Secondary PHY: LE 1M (0x01) SID: 0x00 Scan request notifications: Disabled (0x00) > HCI Event: Command Complete (0x0e) plen 5 LE Set Extended Advertising Parameters (0x08|0x0036) ncmd 1 Status: Success (0x00) TX power (selected): 7 dbm (0x07) < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 6 Extended advertising: Enabled (0x01) Number of sets: 1 (0x01) Entry 0 Handle: 0x00 Duration: 0 ms (0x00) Max ext adv events: 0 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 39 +++++++++ include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 2 + net/bluetooth/hci_event.c | 72 +++++++++++++++++ net/bluetooth/hci_request.c | 171 ++++++++++++++++++++++++++++++++++----- net/bluetooth/hci_request.h | 3 + net/bluetooth/mgmt.c | 22 +++-- 7 files changed, 285 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3f93ae9765a4..b447b127879e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1586,6 +1586,45 @@ struct hci_rp_le_read_num_supported_adv_sets { __u8 num_of_sets; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_PARAMS 0x2036 +struct hci_cp_le_set_ext_adv_params { + __u8 handle; + __le16 evt_properties; + __u8 min_interval[3]; + __u8 max_interval[3]; + __u8 channel_map; + __u8 own_addr_type; + __u8 peer_addr_type; + bdaddr_t peer_addr; + __u8 filter_policy; + __u8 tx_power; + __u8 primary_phy; + __u8 secondary_max_skip; + __u8 secondary_phy; + __u8 sid; + __u8 notif_enable; +} __packed; + +#define HCI_ADV_PHY_1M 0X01 + +struct hci_rp_le_set_ext_adv_params { + __u8 status; + __u8 tx_power; +} __packed; + +#define HCI_OP_LE_SET_EXT_ADV_ENABLE 0x2039 +struct hci_cp_le_set_ext_adv_enable { + __u8 enable; + __u8 num_of_sets; + __u8 data[0]; +} __packed; + +struct hci_cp_ext_adv_set { + __u8 handle; + __le16 duration; + __u8 max_events; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2aad4a863176..ad3518303a0c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -171,6 +171,7 @@ struct adv_info { __u8 adv_data[HCI_MAX_AD_LENGTH]; __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; + __s8 tx_power; }; #define HCI_MAX_ADV_INSTANCES 5 diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 7b08b7f57418..944d4fedc317 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2779,6 +2779,8 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, else adv_instance->duration = duration; + adv_instance->tx_power = HCI_TX_POWER_INVALID; + BT_DBG("%s for %dMR", hdev->name, instance); return 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0ceb52edc142..0418a5514819 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1099,6 +1099,41 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + __u8 status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE); + if (!cp) + return; + + adv_set = (void *) cp->data; + + hci_dev_lock(hdev); + + if (cp->enable) { + struct hci_conn *conn; + + hci_dev_set_flag(hdev, HCI_LE_ADV); + + conn = hci_lookup_le_connect(hdev); + if (conn) + queue_delayed_work(hdev->workqueue, + &conn->le_conn_timeout, + conn->conn_timeout); + } + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_cp_le_set_scan_param *cp; @@ -1486,6 +1521,35 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data; + struct hci_cp_le_set_ext_adv_params *cp; + struct adv_info *adv_instance; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS); + if (!cp) + return; + + hci_dev_lock(hdev); + hdev->adv_addr_type = cp->own_addr_type; + if (!hdev->cur_adv_instance) { + /* Store in hdev for instance 0 */ + hdev->adv_tx_power = rp->tx_power; + } else { + adv_instance = hci_find_adv_instance(hdev, + hdev->cur_adv_instance); + if (adv_instance) + adv_instance->tx_power = rp->tx_power; + } + hci_dev_unlock(hdev); +} + static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_rssi *rp = (void *) skb->data; @@ -3207,6 +3271,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_read_num_adv_sets(hdev, skb); break; + case HCI_OP_LE_SET_EXT_ADV_PARAMS: + hci_cc_set_ext_adv_param(hdev, skb); + break; + + case HCI_OP_LE_SET_EXT_ADV_ENABLE: + hci_cc_le_set_ext_adv_enable(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 215059a7646e..2ac9fd67440a 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -895,6 +895,24 @@ void hci_req_add_le_passive_scan(struct hci_request *req) hdev->le_scan_window, own_addr_type, filter_policy); } +static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv_instance; + + /* Ignore instance 0 */ + if (instance == 0x00) + return 0; + + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return 0; + + /* TODO: Take into account the "appearance" and "local-name" flags here. + * These are currently being ignored as they are not supported. + */ + return adv_instance->scan_rsp_len; +} + static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) { u8 instance = hdev->cur_adv_instance; @@ -1235,15 +1253,27 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) ptr += adv_instance->adv_data_len; } - /* Provide Tx Power only if we can provide a valid value for it */ - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID && - (instance_flags & MGMT_ADV_FLAG_TX_POWER)) { - ptr[0] = 0x02; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8)hdev->adv_tx_power; + if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { + s8 adv_tx_power; - ad_len += 3; - ptr += 3; + if (ext_adv_capable(hdev)) { + if (adv_instance) + adv_tx_power = adv_instance->tx_power; + else + adv_tx_power = hdev->adv_tx_power; + } else { + adv_tx_power = hdev->adv_tx_power; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (adv_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 0x02; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)adv_tx_power; + + ad_len += 3; + ptr += 3; + } } return ad_len; @@ -1304,9 +1334,13 @@ void hci_req_reenable_advertising(struct hci_dev *hdev) __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance, true); } else { - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); - __hci_req_enable_advertising(&req); + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); + } } hci_req_run(&req, adv_enable_complete); @@ -1343,6 +1377,87 @@ unlock: hci_dev_unlock(hdev); } +static int __hci_req_setup_ext_adv_instance(struct hci_request *req, + u8 instance) +{ + struct hci_cp_le_set_ext_adv_params cp; + struct hci_dev *hdev = req->hdev; + bool connectable; + u32 flags; + /* In ext adv set param interval is 3 octets */ + const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; + + flags = get_adv_instance_flags(hdev, instance); + + /* If the "connectable" instance flag was not set, then choose between + * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. + */ + connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || + mgmt_get_connectable(hdev); + + if (!is_advertising_allowed(hdev, connectable)) + return -EPERM; + + memset(&cp, 0, sizeof(cp)); + + memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); + memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); + + if (connectable) + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); + else if (get_adv_instance_scan_rsp_len(hdev, instance)) + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + + cp.own_addr_type = BDADDR_LE_PUBLIC; + cp.channel_map = hdev->le_adv_channel_map; + cp.tx_power = 127; + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_1M; + cp.handle = 0; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + + return 0; +} + +void __hci_req_enable_ext_advertising(struct hci_request *req) +{ + struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *adv_set; + u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; + + cp = (void *) data; + adv_set = (void *) cp->data; + + memset(cp, 0, sizeof(*cp)); + + cp->enable = 0x01; + cp->num_of_sets = 0x01; + + memset(adv_set, 0, sizeof(*adv_set)); + + adv_set->handle = 0; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, + sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets, + data); +} + +int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) +{ + int err; + + err = __hci_req_setup_ext_adv_instance(req, instance); + if (err < 0) + return err; + + __hci_req_enable_ext_advertising(req); + + return 0; +} + int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, bool force) { @@ -1396,9 +1511,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, return 0; hdev->cur_adv_instance = instance; - __hci_req_update_adv_data(req, instance); - __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_advertising(req); + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(req, instance); + } else { + __hci_req_update_adv_data(req, instance); + __hci_req_update_scan_rsp_data(req, instance); + __hci_req_enable_advertising(req); + } return 0; } @@ -1669,8 +1788,12 @@ static int connectable_update(struct hci_request *req, unsigned long opt) /* Update the advertising parameters if necessary */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - !list_empty(&hdev->adv_instances)) - __hci_req_enable_advertising(req); + !list_empty(&hdev->adv_instances)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, hdev->cur_adv_instance); + else + __hci_req_enable_advertising(req); + } __hci_update_background_scan(req); @@ -1779,8 +1902,12 @@ static int discoverable_update(struct hci_request *req, unsigned long opt) /* Discoverable mode affects the local advertising * address in limited privacy mode. */ - if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) - __hci_req_enable_advertising(req); + if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, 0x00); + else + __hci_req_enable_advertising(req); + } } hci_dev_unlock(hdev); @@ -2376,8 +2503,12 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) __hci_req_update_adv_data(req, 0x00); __hci_req_update_scan_rsp_data(req, 0x00); - if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - __hci_req_enable_advertising(req); + if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(req, 0x00); + else + __hci_req_enable_advertising(req); + } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 702beb140d9f..9b8c74df6b2b 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -80,6 +80,9 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, struct hci_request *req, u8 instance, bool force); +int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); +void __hci_req_enable_ext_advertising(struct hci_request *req); + void __hci_req_update_class(struct hci_request *req); /* Returns true if HCI commands were queued */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1867aadc5061..761a9aeaa824 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -940,7 +940,10 @@ static void rpa_expired(struct work_struct *work) * function. */ hci_req_init(&req, hdev); - __hci_req_enable_advertising(&req); + if (ext_adv_capable(hdev)) + __hci_req_start_ext_adv(&req, hdev->cur_adv_instance); + else + __hci_req_enable_advertising(&req); hci_req_run(&req, NULL); } @@ -4382,9 +4385,14 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, * HCI_ADVERTISING flag is not yet set. */ hdev->cur_adv_instance = 0x00; - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); - __hci_req_enable_advertising(&req); + + if (ext_adv_capable(hdev)) { + __hci_req_start_ext_adv(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + __hci_req_enable_advertising(&req); + } } else { __hci_req_disable_advertising(&req); } @@ -6312,7 +6320,11 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_APPEARANCE; flags |= MGMT_ADV_FLAG_LOCAL_NAME; - if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) + /* In extended adv TX_POWER returned from Set Adv Param + * will be always valid. + */ + if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) || + ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; return flags; -- cgit v1.2.3 From a0fb3726ba55138ef6fdd5dc67da6d9a70360696 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:42 +0530 Subject: Bluetooth: Use Set ext adv/scan rsp data if controller supports This patch implements Set Ext Adv data and Set Ext Scan rsp data if controller support extended advertising. Currently the operation is set as Complete data and fragment preference is set as no fragment < HCI Command: LE Set Extended Advertising Data (0x08|0x0037) plen 35 Handle: 0x00 Operation: Complete extended advertising data (0x03) Fragment preference: Minimize fragmentation (0x01) Data length: 0x15 16-bit Service UUIDs (complete): 2 entries Heart Rate (0x180d) Battery Service (0x180f) Name (complete): Test LE Company: Google (224) Data: 0102 > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Data (0x08|0x0037) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 22 ++++++++ net/bluetooth/hci_event.c | 2 + net/bluetooth/hci_request.c | 126 +++++++++++++++++++++++++++++++++----------- net/bluetooth/hci_request.h | 1 + net/bluetooth/mgmt.c | 13 +++-- 5 files changed, 130 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index b447b127879e..aace97099ead 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1625,6 +1625,28 @@ struct hci_cp_ext_adv_set { __u8 max_events; } __packed; +#define HCI_OP_LE_SET_EXT_ADV_DATA 0x2037 +struct hci_cp_le_set_ext_adv_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA 0x2038 +struct hci_cp_le_set_ext_scan_rsp_data { + __u8 handle; + __u8 operation; + __u8 frag_pref; + __u8 length; + __u8 data[HCI_MAX_AD_LENGTH]; +} __packed; + +#define LE_SET_ADV_DATA_OP_COMPLETE 0x03 + +#define LE_SET_ADV_DATA_NO_FRAG 0x01 + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0418a5514819..0a92bf7e3d80 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1547,6 +1547,8 @@ static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb) if (adv_instance) adv_instance->tx_power = rp->tx_power; } + /* Update adv data as tx power is known now */ + hci_req_update_adv_data(hdev, hdev->cur_adv_instance); hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 2ac9fd67440a..c41e9bb7818b 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1174,29 +1174,58 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_scan_rsp_data cp; u8 len; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; - memset(&cp, 0, sizeof(cp)); + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_scan_rsp_data cp; - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, cp.data); - else - len = create_default_scan_rsp_data(hdev, cp.data); + memset(&cp, 0, sizeof(cp)); - if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) - return; + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, + cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); + + if (hdev->scan_rsp_data_len == len && + !memcmp(cp.data, hdev->scan_rsp_data, len)) + return; + + memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + hdev->scan_rsp_data_len = len; + + cp.handle = 0; + cp.length = len; + cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; + + hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp), + &cp); + } else { + struct hci_cp_le_set_scan_rsp_data cp; + + memset(&cp, 0, sizeof(cp)); + + if (instance) + len = create_instance_scan_rsp_data(hdev, instance, + cp.data); + else + len = create_default_scan_rsp_data(hdev, cp.data); + + if (hdev->scan_rsp_data_len == len && + !memcmp(cp.data, hdev->scan_rsp_data, len)) + return; - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); - hdev->scan_rsp_data_len = len; + memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); + hdev->scan_rsp_data_len = len; - cp.length = len; + cp.length = len; - hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); + hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); + } } static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) @@ -1282,27 +1311,51 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) void __hci_req_update_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_adv_data cp; u8 len; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; - memset(&cp, 0, sizeof(cp)); + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_adv_data cp; - len = create_instance_adv_data(hdev, instance, cp.data); + memset(&cp, 0, sizeof(cp)); - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) - return; + len = create_instance_adv_data(hdev, instance, cp.data); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return; + + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = len; + + cp.length = len; + cp.handle = 0; + cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; + cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); - hdev->adv_data_len = len; + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp); + } else { + struct hci_cp_le_set_adv_data cp; + + memset(&cp, 0, sizeof(cp)); - cp.length = len; + len = create_instance_adv_data(hdev, instance, cp.data); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return; - hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = len; + + cp.length = len; + + hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); + } } int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) @@ -1377,8 +1430,7 @@ unlock: hci_dev_unlock(hdev); } -static int __hci_req_setup_ext_adv_instance(struct hci_request *req, - u8 instance) +int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_dev *hdev = req->hdev; @@ -1453,6 +1505,7 @@ int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) if (err < 0) return err; + __hci_req_update_scan_rsp_data(req, instance); __hci_req_enable_ext_advertising(req); return 0; @@ -2500,14 +2553,25 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt) */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || list_empty(&hdev->adv_instances)) { - __hci_req_update_adv_data(req, 0x00); - __hci_req_update_scan_rsp_data(req, 0x00); + int err; + + if (ext_adv_capable(hdev)) { + err = __hci_req_setup_ext_adv_instance(req, + 0x00); + if (!err) + __hci_req_update_scan_rsp_data(req, + 0x00); + } else { + err = 0; + __hci_req_update_adv_data(req, 0x00); + __hci_req_update_scan_rsp_data(req, 0x00); + } if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { - if (ext_adv_capable(hdev)) - __hci_req_start_ext_adv(req, 0x00); - else + if (!ext_adv_capable(hdev)) __hci_req_enable_advertising(req); + else if (!err) + __hci_req_enable_ext_advertising(req); } } else if (!list_empty(&hdev->adv_instances)) { struct adv_info *adv_instance; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 9b8c74df6b2b..6afc624605af 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -80,6 +80,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, struct hci_request *req, u8 instance, bool force); +int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void __hci_req_enable_ext_advertising(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 761a9aeaa824..142f7e72a9a2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1847,10 +1847,17 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) */ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { struct hci_request req; - hci_req_init(&req, hdev); - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); + if (ext_adv_capable(hdev)) { + int err; + + err = __hci_req_setup_ext_adv_instance(&req, 0x00); + if (!err) + __hci_req_update_scan_rsp_data(&req, 0x00); + } else { + __hci_req_update_adv_data(&req, 0x00); + __hci_req_update_scan_rsp_data(&req, 0x00); + } hci_req_run(&req, NULL); hci_update_background_scan(hdev); } -- cgit v1.2.3 From 45b7749f16aacd9ffab8e958caa77e2aa2358c0b Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:43 +0530 Subject: Bluetooth: Implement disable and removal of adv instance If ext adv is enabled then use ext adv to disable as well. Also remove the adv set during LE disable. < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 ++ net/bluetooth/hci_event.c | 2 ++ net/bluetooth/hci_request.c | 23 +++++++++++++++++++++-- net/bluetooth/hci_request.h | 1 + net/bluetooth/mgmt.c | 3 +++ 5 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index aace97099ead..faa2922a69fd 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1647,6 +1647,8 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define LE_SET_ADV_DATA_NO_FRAG 0x01 +#define HCI_OP_LE_CLEAR_ADV_SETS 0x203d + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a92bf7e3d80..a78d1dd2f57b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1129,6 +1129,8 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, conn->conn_timeout); + } else { + hci_dev_clear_flag(hdev, HCI_LE_ADV); } hci_dev_unlock(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c41e9bb7818b..96e1e05a92c3 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -934,9 +934,19 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) void __hci_req_disable_advertising(struct hci_request *req) { - u8 enable = 0x00; + if (ext_adv_capable(req->hdev)) { + struct hci_cp_le_set_ext_adv_enable cp; - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + cp.enable = 0x00; + /* Disable all sets since we only support one set at the moment */ + cp.num_of_sets = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); + } else { + u8 enable = 0x00; + + hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); + } } static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) @@ -1430,6 +1440,11 @@ unlock: hci_dev_unlock(hdev); } +void __hci_req_clear_ext_adv_sets(struct hci_request *req) +{ + hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); +} + int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; @@ -1499,8 +1514,12 @@ void __hci_req_enable_ext_advertising(struct hci_request *req) int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) { + struct hci_dev *hdev = req->hdev; int err; + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + __hci_req_disable_advertising(req); + err = __hci_req_setup_ext_adv_instance(req, instance); if (err < 0) return err; diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 6afc624605af..2451861bb4f8 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -83,6 +83,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void __hci_req_enable_ext_advertising(struct hci_request *req); +void __hci_req_clear_ext_adv_sets(struct hci_request *req); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 142f7e72a9a2..c283f0364c0f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1956,6 +1956,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) } else { if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_disable_advertising(&req); + + if (ext_adv_capable(hdev)) + __hci_req_clear_ext_adv_sets(&req); } hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp), -- cgit v1.2.3 From a73c046a2869048430c332a871a5b169f192c6c3 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:45 +0530 Subject: Bluetooth: Implement Set ADV set random address This basically sets the random address for the adv instance Random address can be set only if the instance is created which is done in Set ext adv param. Random address and rpa expire timer and flags have been added to adv instance which will be used when the respective instance is scheduled. This introduces a hci_get_random_address() which returns the own address type and random address (rpa or nrpa) based on the instance flags and hdev flags. New function is required since own address type should be known before setting adv params but address can be set only after setting params. < HCI Command: LE Set Advertising Set Random Address (0x08|0x0035) plen 7 Advertising handle: 0x00 Advertising random address: 3C:8E:56:9B:77:84 (OUI 3C-8E-56) > HCI Event: Command Complete (0x0e) plen 4 LE Set Advertising Set Random Address (0x08|0x0035) ncmd 1 Status: Success (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 6 ++ include/net/bluetooth/hci_core.h | 4 ++ net/bluetooth/hci_conn.c | 23 +++++++ net/bluetooth/hci_core.c | 33 +++++++++- net/bluetooth/hci_event.c | 37 ++++++++++- net/bluetooth/hci_request.c | 128 ++++++++++++++++++++++++++++++++++++++- net/bluetooth/hci_request.h | 3 + net/bluetooth/mgmt.c | 2 + 8 files changed, 233 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index faa2922a69fd..8d348d0d3eea 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1649,6 +1649,12 @@ struct hci_cp_le_set_ext_scan_rsp_data { #define HCI_OP_LE_CLEAR_ADV_SETS 0x203d +#define HCI_OP_LE_SET_ADV_SET_RAND_ADDR 0x2035 +struct hci_cp_le_set_adv_set_rand_addr { + __u8 handle; + bdaddr_t bdaddr; +} __packed; + /* ---- HCI Events ---- */ #define HCI_EV_INQUIRY_COMPLETE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ad3518303a0c..0db1b9b428b7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -172,6 +172,9 @@ struct adv_info { __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_AD_LENGTH]; __s8 tx_power; + bdaddr_t random_addr; + bool rpa_expired; + struct delayed_work rpa_expired_cb; }; #define HCI_MAX_ADV_INSTANCES 5 @@ -1113,6 +1116,7 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration); int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance); +void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired); void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5c37d383caa3..bd4978ce8c45 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -873,6 +873,14 @@ static void hci_req_directed_advertising(struct hci_request *req, if (ext_adv_capable(hdev)) { struct hci_cp_le_set_ext_adv_params cp; + bdaddr_t random_addr; + + /* Set require_privacy to false so that the remote device has a + * chance of identifying us. + */ + if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, + &own_addr_type, &random_addr) < 0) + return; memset(&cp, 0, sizeof(cp)); @@ -889,6 +897,21 @@ static void hci_req_directed_advertising(struct hci_request *req, hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + if (own_addr_type == ADDR_LE_DEV_RANDOM && + bacmp(&random_addr, BDADDR_ANY) && + bacmp(&random_addr, &hdev->random_addr)) { + struct hci_cp_le_set_adv_set_rand_addr cp; + + memset(&cp, 0, sizeof(cp)); + + cp.handle = 0; + bacpy(&cp.bdaddr, &random_addr); + + hci_req_add(req, + HCI_OP_LE_SET_ADV_SET_RAND_ADDR, + sizeof(cp), &cp); + } + __hci_req_enable_ext_advertising(req); } else { struct hci_cp_le_set_adv_param cp; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 944d4fedc317..840e8fd89fa5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1471,6 +1471,7 @@ static int hci_dev_do_open(struct hci_dev *hdev) if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); hci_leds_update_powered(hdev, true); @@ -1626,9 +1627,15 @@ int hci_dev_do_close(struct hci_dev *hdev) if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); - if (hci_dev_test_flag(hdev, HCI_MGMT)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) { + struct adv_info *adv_instance; + cancel_delayed_work_sync(&hdev->rpa_expired); + list_for_each_entry(adv_instance, &hdev->adv_instances, list) + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); + } + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ @@ -2704,6 +2711,8 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) hdev->cur_adv_instance = 0x00; } + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); + list_del(&adv_instance->list); kfree(adv_instance); @@ -2712,6 +2721,14 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) return 0; } +void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) +{ + struct adv_info *adv_instance, *n; + + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) + adv_instance->rpa_expired = rpa_expired; +} + /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { @@ -2723,6 +2740,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev) } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { + cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } @@ -2731,6 +2749,16 @@ void hci_adv_instances_clear(struct hci_dev *hdev) hdev->cur_adv_instance = 0x00; } +static void adv_instance_rpa_expired(struct work_struct *work) +{ + struct adv_info *adv_instance = container_of(work, struct adv_info, + rpa_expired_cb.work); + + BT_DBG(""); + + adv_instance->rpa_expired = true; +} + /* This function requires the caller holds hdev->lock */ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -2781,6 +2809,9 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, adv_instance->tx_power = HCI_TX_POWER_INVALID; + INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb, + adv_instance_rpa_expired); + BT_DBG("%s for %dMR", hdev->name, instance); return 0; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a78d1dd2f57b..392c9d8febd0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1064,6 +1064,35 @@ static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } +static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *) skb->data); + struct hci_cp_le_set_adv_set_rand_addr *cp; + struct adv_info *adv_instance; + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (!hdev->cur_adv_instance) { + /* Store in hdev for instance 0 (Set adv and Directed advs) */ + bacpy(&hdev->random_addr, &cp->bdaddr); + } else { + adv_instance = hci_find_adv_instance(hdev, + hdev->cur_adv_instance); + if (adv_instance) + bacpy(&adv_instance->random_addr, &cp->bdaddr); + } + + hci_dev_unlock(hdev); +} + static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { __u8 *sent, status = *((__u8 *) skb->data); @@ -2830,8 +2859,10 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) /* We should disregard the current RPA and generate a new one * whenever the encryption procedure fails. */ - if (ev->status && conn->type == LE_LINK) + if (ev->status && conn->type == LE_LINK) { hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); + } clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); @@ -3283,6 +3314,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_set_ext_adv_enable(hdev, skb); break; + case HCI_OP_LE_SET_ADV_SET_RAND_ADDR: + hci_cc_le_set_adv_set_random_addr(hdev, skb); + break; + default: BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 96e1e05a92c3..c72fd9202666 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1440,6 +1440,87 @@ unlock: hci_dev_unlock(hdev); } +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr) +{ + int err; + + bacpy(rand_addr, BDADDR_ANY); + + /* If privacy is enabled use a resolvable private address. If + * current RPA has expired then generate a new one. + */ + if (use_rpa) { + int to; + + *own_addr_type = ADDR_LE_DEV_RANDOM; + + if (adv_instance) { + if (!adv_instance->rpa_expired && + !bacmp(&adv_instance->random_addr, &hdev->rpa)) + return 0; + + adv_instance->rpa_expired = false; + } else { + if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && + !bacmp(&hdev->random_addr, &hdev->rpa)) + return 0; + } + + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); + if (err < 0) { + BT_ERR("%s failed to generate new RPA", hdev->name); + return err; + } + + bacpy(rand_addr, &hdev->rpa); + + to = msecs_to_jiffies(hdev->rpa_timeout * 1000); + if (adv_instance) + queue_delayed_work(hdev->workqueue, + &adv_instance->rpa_expired_cb, to); + else + queue_delayed_work(hdev->workqueue, + &hdev->rpa_expired, to); + + return 0; + } + + /* In case of required privacy without resolvable private address, + * use an non-resolvable private address. This is useful for + * non-connectable advertising. + */ + if (require_privacy) { + bdaddr_t nrpa; + + while (true) { + /* The non-resolvable private address is generated + * from random six bytes with the two most significant + * bits cleared. + */ + get_random_bytes(&nrpa, 6); + nrpa.b[5] &= 0x3f; + + /* The non-resolvable private address shall not be + * equal to the public address. + */ + if (bacmp(&hdev->bdaddr, &nrpa)) + break; + } + + *own_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(rand_addr, &nrpa); + + return 0; + } + + /* No privacy so use a public address. */ + *own_addr_type = ADDR_LE_DEV_PUBLIC; + + return 0; +} + void __hci_req_clear_ext_adv_sets(struct hci_request *req) { hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); @@ -1451,9 +1532,21 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) struct hci_dev *hdev = req->hdev; bool connectable; u32 flags; + bdaddr_t random_addr; + u8 own_addr_type; + int err; + struct adv_info *adv_instance; /* In ext adv set param interval is 3 octets */ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; + if (instance > 0) { + adv_instance = hci_find_adv_instance(hdev, instance); + if (!adv_instance) + return -EINVAL; + } else { + adv_instance = NULL; + } + flags = get_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between @@ -1465,6 +1558,16 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) if (!is_advertising_allowed(hdev, connectable)) return -EPERM; + /* Set require_privacy to true only when non-connectable + * advertising is used. In that case it is fine to use a + * non-resolvable private address. + */ + err = hci_get_random_address(hdev, !connectable, + adv_use_rpa(hdev, flags), adv_instance, + &own_addr_type, &random_addr); + if (err < 0) + return err; + memset(&cp, 0, sizeof(cp)); memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); @@ -1477,7 +1580,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) else cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); - cp.own_addr_type = BDADDR_LE_PUBLIC; + cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; cp.primary_phy = HCI_ADV_PHY_1M; @@ -1486,6 +1589,29 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); + if (own_addr_type == ADDR_LE_DEV_RANDOM && + bacmp(&random_addr, BDADDR_ANY)) { + struct hci_cp_le_set_adv_set_rand_addr cp; + + /* Check if random address need to be updated */ + if (adv_instance) { + if (!bacmp(&random_addr, &adv_instance->random_addr)) + return 0; + } else { + if (!bacmp(&random_addr, &hdev->random_addr)) + return 0; + } + + memset(&cp, 0, sizeof(cp)); + + cp.handle = 0; + bacpy(&cp.bdaddr, &random_addr); + + hci_req_add(req, + HCI_OP_LE_SET_ADV_SET_RAND_ADDR, + sizeof(cp), &cp); + } + return 0; } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 2451861bb4f8..692cc8b13368 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -84,6 +84,9 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); void __hci_req_enable_ext_advertising(struct hci_request *req); void __hci_req_clear_ext_adv_sets(struct hci_request *req); +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c283f0364c0f..949986727019 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4972,6 +4972,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, true); if (cp->privacy == 0x02) hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY); else @@ -4980,6 +4981,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); memset(hdev->irk, 0, sizeof(hdev->irk)); hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); + hci_adv_instances_set_rpa_expired(hdev, false); hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } -- cgit v1.2.3 From acf0aeae431a0f1723385cd1cb50177e4cc10edd Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:46 +0530 Subject: Bluetooth: Handle ADv set terminated event This event comes after connection complete event for incoming connections. Since we now have different random address for each instance, conn resp address is assigned from this event. As of now only connection part is handled as we are not enabling duration or max num of events while starting ext adv. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 8 ++++++++ net/bluetooth/hci_core.c | 8 ++++++++ net/bluetooth/hci_event.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 56 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8d348d0d3eea..57e3e3675d66 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2155,6 +2155,14 @@ struct hci_ev_le_enh_conn_complete { __u8 clk_accurancy; } __packed; +#define HCI_EV_LE_EXT_ADV_SET_TERM 0x12 +struct hci_evt_le_ext_adv_set_term { + __u8 status; + __u8 handle; + __le16 conn_handle; + __u8 num_evts; +} __packed; + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 840e8fd89fa5..79e02d24a215 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -712,6 +712,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Complete */ + /* If the controller supports the LE Extended Advertising + * command, enable the corresponding event. + */ + if (ext_adv_capable(hdev)) + events[2] |= 0x02; /* LE Advertising Set + * Terminated + */ + hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 392c9d8febd0..754714c8d752 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4798,10 +4798,15 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, * the advertising address type. */ conn->resp_addr_type = hdev->adv_addr_type; - if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) - bacpy(&conn->resp_addr, &hdev->random_addr); - else + if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { + /* In case of ext adv, resp_addr will be updated in + * Adv Terminated event. + */ + if (!ext_adv_capable(hdev)) + bacpy(&conn->resp_addr, &hdev->random_addr); + } else { bacpy(&conn->resp_addr, &hdev->bdaddr); + } conn->init_addr_type = bdaddr_type; bacpy(&conn->init_addr, bdaddr); @@ -4931,6 +4936,34 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data; + struct hci_conn *conn; + + BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); + + if (ev->status) + return; + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle)); + if (conn) { + struct adv_info *adv_instance; + + if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) + return; + + if (!hdev->cur_adv_instance) { + bacpy(&conn->resp_addr, &hdev->random_addr); + return; + } + + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + if (adv_instance) + bacpy(&conn->resp_addr, &adv_instance->random_addr); + } +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5578,6 +5611,10 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_le_enh_conn_complete_evt(hdev, skb); break; + case HCI_EV_LE_EXT_ADV_SET_TERM: + hci_le_ext_adv_term_evt(hdev, skb); + break; + default: break; } -- cgit v1.2.3 From 85a721a8b0b6880d8cf6b9def70404ade8563225 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2018 17:09:47 +0530 Subject: Bluetooth: Implement secondary advertising on different PHYs This patch adds support for advertising in primary and secondary channel on different PHYs. User can add the phy preference in the flag based on which phy type will be added in extended advertising parameter would be set. @ MGMT Command: Add Advertising (0x003e) plen 11 Instance: 1 Flags: 0x00000200 Advertise in CODED on Secondary channel Duration: 0 Timeout: 0 Advertising data length: 0 Scan response length: 0 < HCI Command: LE Set Extended Advertising Enable (0x08|0x0039) plen 2 Extended advertising: Disabled (0x00) Number of sets: Disable all sets (0x00) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Advertising Enable (0x08|0x0039) ncmd 2 Status: Success (0x00) < HCI Command: LE Set Extended Advertising Parameters (0x08|0x0036) plen 25 Handle: 0x00 Properties: 0x0000 Min advertising interval: 1280.000 msec (0x0800) Max advertising interval: 1280.000 msec (0x0800) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: 127 dbm (0x7f) Primary PHY: LE Coded (0x03) Secondary max skip: 0x00 Secondary PHY: LE Coded (0x03) SID: 0x00 Scan request notifications: Disabled (0x00) Signed-off-by: Jaganath Kanakkassery Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 4 ++++ include/net/bluetooth/mgmt.h | 6 ++++++ net/bluetooth/hci_request.c | 39 +++++++++++++++++++++++++++++++-------- net/bluetooth/mgmt.c | 18 +++++++++++++++--- 4 files changed, 56 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 57e3e3675d66..8ff36463719f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -410,6 +410,8 @@ enum { #define HCI_LE_SLAVE_FEATURES 0x08 #define HCI_LE_PING 0x10 #define HCI_LE_DATA_LEN_EXT 0x20 +#define HCI_LE_PHY_2M 0x01 +#define HCI_LE_PHY_CODED 0x08 #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_EXT_SCAN_POLICY 0x80 #define HCI_LE_PHY_2M 0x01 @@ -1606,6 +1608,8 @@ struct hci_cp_le_set_ext_adv_params { } __packed; #define HCI_ADV_PHY_1M 0X01 +#define HCI_ADV_PHY_2M 0x02 +#define HCI_ADV_PHY_CODED 0x03 struct hci_rp_le_set_ext_adv_params { __u8 status; diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7f372e9067c9..9cee7ddc6741 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -562,6 +562,12 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_FLAG_TX_POWER BIT(4) #define MGMT_ADV_FLAG_APPEARANCE BIT(5) #define MGMT_ADV_FLAG_LOCAL_NAME BIT(6) +#define MGMT_ADV_FLAG_SEC_1M BIT(7) +#define MGMT_ADV_FLAG_SEC_2M BIT(8) +#define MGMT_ADV_FLAG_SEC_CODED BIT(9) + +#define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ + MGMT_ADV_FLAG_SEC_CODED) #define MGMT_OP_REMOVE_ADVERTISING 0x003F struct mgmt_cp_remove_advertising { diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index c72fd9202666..e8c9ef1e1922 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1536,6 +1536,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) u8 own_addr_type; int err; struct adv_info *adv_instance; + bool secondary_adv; /* In ext adv set param interval is 3 octets */ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 }; @@ -1573,20 +1574,42 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval)); memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval)); - if (connectable) - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - else if (get_adv_instance_scan_rsp_len(hdev, instance)) - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); - else - cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); + + if (connectable) { + if (secondary_adv) + cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); + } else if (get_adv_instance_scan_rsp_len(hdev, instance)) { + if (secondary_adv) + cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); + } else { + if (secondary_adv) + cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); + else + cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); + } cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = 127; - cp.primary_phy = HCI_ADV_PHY_1M; - cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0; + if (flags & MGMT_ADV_FLAG_SEC_2M) { + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_2M; + } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { + cp.primary_phy = HCI_ADV_PHY_CODED; + cp.secondary_phy = HCI_ADV_PHY_CODED; + } else { + /* In all other cases use 1M */ + cp.primary_phy = HCI_ADV_PHY_1M; + cp.secondary_phy = HCI_ADV_PHY_1M; + } + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); if (own_addr_type == ADDR_LE_DEV_RANDOM && diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 949986727019..231602f7cb66 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6339,6 +6339,16 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; + if (ext_adv_capable(hdev)) { + flags |= MGMT_ADV_FLAG_SEC_1M; + + if (hdev->le_features[1] & HCI_LE_PHY_2M) + flags |= MGMT_ADV_FLAG_SEC_2M; + + if (hdev->le_features[1] & HCI_LE_PHY_CODED) + flags |= MGMT_ADV_FLAG_SEC_CODED; + } + return flags; } @@ -6544,7 +6554,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_add_advertising *cp = data; struct mgmt_rp_add_advertising rp; u32 flags; - u32 supported_flags; + u32 supported_flags, phy_flags; u8 status; u16 timeout, duration; unsigned int prev_instance_cnt = hdev->adv_instance_cnt; @@ -6574,10 +6584,12 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, duration = __le16_to_cpu(cp->duration); /* The current implementation only supports a subset of the specified - * flags. + * flags. Also need to check mutual exclusiveness of sec flags. */ supported_flags = get_supported_adv_flags(hdev); - if (flags & ~supported_flags) + phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK; + if (flags & ~supported_flags || + ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags))))) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); -- cgit v1.2.3 From 740011cfe94859df8d05f5400d589a8693b095e7 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 20 Jul 2018 13:12:28 +0800 Subject: Bluetooth: Add new quirk for non-persistent setup settings Add a new quirk HCI_QUIRK_NON_PERSISTENT_SETUP allowing that a quirk that runs setup() after every open() and not just after the first open(). Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_core.c | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8ff36463719f..7f008097552e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -183,6 +183,15 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_NON_PERSISTENT_DIAG, + + /* When this quirk is set, setup() would be run after every + * open() and not just after the first open(). + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + * + */ + HCI_QUIRK_NON_PERSISTENT_SETUP, }; /* HCI device flags */ diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 79e02d24a215..74b29c7d841c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1415,7 +1415,8 @@ static int hci_dev_do_open(struct hci_dev *hdev) atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); - if (hci_dev_test_flag(hdev, HCI_SETUP)) { + if (hci_dev_test_flag(hdev, HCI_SETUP) || + test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) -- cgit v1.2.3 From dd979b4df817e9976f18fb6f9d134d6bc4a3c317 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:10 +0200 Subject: net: simplify sock_poll_wait The wait_address argument is always directly derived from the filp argument, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- crypto/af_alg.c | 2 +- include/net/sock.h | 11 ++++++----- net/atm/common.c | 2 +- net/caif/caif_socket.c | 2 +- net/core/datagram.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 2 +- net/iucv/af_iucv.c | 2 +- net/nfc/llcp_sock.c | 2 +- net/rxrpc/af_rxrpc.c | 2 +- net/smc/af_smc.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 4 ++-- 13 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index c166f424871c..b053179e0bc5 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1071,7 +1071,7 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock, struct af_alg_ctx *ctx = ask->private; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; if (!ctx->more || ctx->used) diff --git a/include/net/sock.h b/include/net/sock.h index 83b747538bd0..0518f61926ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2057,16 +2057,17 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) /** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file - * @wait_address: socket wait queue * @p: poll_table * * See the comments in the wq_has_sleeper function. */ -static inline void sock_poll_wait(struct file *filp, - wait_queue_head_t *wait_address, poll_table *p) +static inline void sock_poll_wait(struct file *filp, poll_table *p) { - if (!poll_does_not_wait(p) && wait_address) { - poll_wait(filp, wait_address, p); + struct socket *sock = filp->private_data; + wait_queue_head_t *wq = sk_sleep(sock->sk); + + if (!poll_does_not_wait(p) && wq) { + poll_wait(filp, wq, p); /* We need to be sure we are in sync with the * socket flags modification. * diff --git a/net/atm/common.c b/net/atm/common.c index a7a68e509628..9f8cb0d2e71e 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a6fb1b3bcad9..d18965f3291f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file, __poll_t mask; struct caifsock *cf_sk = container_of(sk, struct caifsock, sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/datagram.c b/net/core/datagram.c index 9938952c5c78..9aac0d63d53e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d56e36a6db7..875858c8b059 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, __poll_t mask; struct sock *sk = sock->sk; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 514aaac1626f..f3bfb9f29520 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) const struct tcp_sock *tp = tcp_sk(sk); int state; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 8d1c43f8fed4..92ee91e34395 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1494,7 +1494,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; __poll_t mask = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index ea0c0c6f1874..dd4adf8b1167 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, pr_debug("%p\n", sk); - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2b463047dd7b..ac44d8afffb1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -741,7 +741,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fce7e4751151..0fc94f296e54 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1535,7 +1535,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLERR; } else { if (sk->sk_state != SMC_CLOSED) - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3d21414ba357..3763bedecf5f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -716,7 +716,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e5473c03d667..1772a0e32665 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2635,7 +2635,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa struct sock *sk = sock->sk; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ @@ -2672,7 +2672,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, unsigned int writable; __poll_t mask; - sock_poll_wait(file, sk_sleep(sk), wait); + sock_poll_wait(file, wait); mask = 0; /* exceptional events? */ -- cgit v1.2.3 From d8bbd13beeaacd6494954bf5b945b54ccb2af309 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:11 +0200 Subject: net: don not detour through struct sock to find the poll waitqueue For any open socket file descriptor sock->sk->sk_wq->wait will always point to sock->wq->wait. That means we can do the shorter dereference and removal a NULL check and don't have to not worry about any RCU protection. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 0518f61926ec..2afea5d1bdfe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2064,10 +2064,9 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, poll_table *p) { struct socket *sock = filp->private_data; - wait_queue_head_t *wq = sk_sleep(sock->sk); - if (!poll_does_not_wait(p) && wq) { - poll_wait(filp, wq, p); + if (!poll_does_not_wait(p)) { + poll_wait(filp, &sock->wq->wait, p); /* We need to be sure we are in sync with the * socket flags modification. * -- cgit v1.2.3 From f641f13b992979b97e595b761a9ba1a64fed7c4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:12 +0200 Subject: net: remove sock_poll_busy_loop There is no point in hiding this logic in a helper. Also remove the useless events != 0 check and only busy loop once we know we actually have a poll method. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/busy_poll.h | 9 --------- net/socket.c | 5 ++++- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 9e36fda652b7..85777e68f738 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,15 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -static inline void sock_poll_busy_loop(struct socket *sock, __poll_t events) -{ - if (sk_can_busy_loop(sock->sk) && - events && (events & POLL_BUSY_LOOP)) { - /* once, only if requested by syscall */ - sk_busy_loop(sock->sk, 1); - } -} - /* if this socket can poll_ll, tell the system call */ static inline __poll_t sock_poll_busy_flag(struct socket *sock) { diff --git a/net/socket.c b/net/socket.c index 85633622c94d..674434127b3a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1132,9 +1132,12 @@ static __poll_t sock_poll(struct file *file, poll_table *wait) struct socket *sock = file->private_data; __poll_t events = poll_requested_events(wait); - sock_poll_busy_loop(sock, events); if (!sock->ops->poll) return 0; + + /* poll once if requested by the syscall */ + if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP)) + sk_busy_loop(sock->sk, 1); return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); } -- cgit v1.2.3 From a331de3bf0e66ab2437fc8c5b99bd3c0d9da3088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:42:13 +0200 Subject: net: remove sock_poll_busy_flag Fold it into the only caller to make the code simpler and easier to read. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/busy_poll.h | 6 ------ net/socket.c | 16 +++++++++++----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 85777e68f738..ba61cdd09eaa 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -121,12 +121,6 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #endif } -/* if this socket can poll_ll, tell the system call */ -static inline __poll_t sock_poll_busy_flag(struct socket *sock) -{ - return sk_can_busy_loop(sock->sk) ? POLL_BUSY_LOOP : 0; -} - /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) diff --git a/net/socket.c b/net/socket.c index 674434127b3a..5b7df6695f4f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1130,15 +1130,21 @@ EXPORT_SYMBOL(sock_create_lite); static __poll_t sock_poll(struct file *file, poll_table *wait) { struct socket *sock = file->private_data; - __poll_t events = poll_requested_events(wait); + __poll_t events = poll_requested_events(wait), flag = 0; if (!sock->ops->poll) return 0; - /* poll once if requested by the syscall */ - if (sk_can_busy_loop(sock->sk) && (events & POLL_BUSY_LOOP)) - sk_busy_loop(sock->sk, 1); - return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock); + if (sk_can_busy_loop(sock->sk)) { + /* poll once if requested by the syscall */ + if (events & POLL_BUSY_LOOP) + sk_busy_loop(sock->sk, 1); + + /* if this socket can poll_ll, tell the system call */ + flag = POLL_BUSY_LOOP; + } + + return sock->ops->poll(file, sock, wait) | flag; } static int sock_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From 7fd4b288ea6a3e45ad8afbcd5ec39554d57f1ae0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:43 +0200 Subject: tc/act: remove unneeded RCU lock in action callback Each lockless action currently does its own RCU locking in ->act(). This allows using plain RCU accessor, even if the context is really RCU BH. This change drops the per action RCU lock, replace the accessors with the _bh variant, cleans up a bit the surrounding code and documents the RCU status in the relevant header. No functional nor performance change is intended. The goal of this patch is clarifying that the RCU critical section used by the tc actions extends up to the classifier's caller. v1 -> v2: - preserve rcu lock in act_bpf: it's needed by eBPF helpers, as pointed out by Daniel v3 -> v4: - fixed some typos in the commit message (JiriP) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/sch_generic.h | 2 ++ net/sched/act_csum.c | 12 +++--------- net/sched/act_ife.c | 5 +---- net/sched/act_mirred.c | 4 +--- net/sched/act_sample.c | 4 +--- net/sched/act_skbedit.c | 10 +++------- net/sched/act_skbmod.c | 21 +++++++++------------ net/sched/act_tunnel_key.c | 6 +----- net/sched/act_vlan.c | 19 +++++++------------ 10 files changed, 29 insertions(+), 56 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 683ce41053d9..8c9bc02d05e1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -85,7 +85,7 @@ struct tc_action_ops { size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, - struct tcf_result *); + struct tcf_result *); /* called under RCU BH lock*/ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c5432362dc26..bcae181c1857 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -285,6 +285,8 @@ struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; + + /* called under RCU BH lock*/ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4e8c383f379e..648a3a35b720 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -561,15 +561,14 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, u32 update_flags; int action; - rcu_read_lock(); - params = rcu_dereference(p->params); + params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) - goto drop_stats; + goto drop; update_flags = params->update_flags; switch (tc_skb_protocol(skb)) { @@ -583,16 +582,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, break; } -unlock: - rcu_read_unlock(); return action; drop: - action = TC_ACT_SHOT; - -drop_stats: qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); - goto unlock; + return TC_ACT_SHOT; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3d6e265758c0..df4060e32d43 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -820,14 +820,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_params *p; int ret; - rcu_read_lock(); - p = rcu_dereference(ife->params); + p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); - rcu_read_unlock(); return ret; } - rcu_read_unlock(); return tcf_ife_decode(skb, a, res); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6afd89a36c69..eeb335f03102 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -181,11 +181,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - rcu_read_lock(); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = READ_ONCE(m->tcf_action); - dev = rcu_dereference(m->tcfm_dev); + dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); goto out; @@ -236,7 +235,6 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3079e7be5bde..2608ccc83e5e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -140,8 +140,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); - rcu_read_lock(); - psample_group = rcu_dereference(s->psample_group); + psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { @@ -165,7 +164,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, skb_pull(skb, skb->mac_len); } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index da56e6938c9e..a6db47ebec11 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,8 +43,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - rcu_read_lock(); - params = rcu_dereference(d->params); + params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) @@ -77,14 +76,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - -unlock: - rcu_read_unlock(); return action; + err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - action = TC_ACT_SHOT; - goto unlock; + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index cdc6bacfb190..c437c6d51a71 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, * then MAX_EDIT_LEN needs to change appropriately */ err = skb_ensure_writable(skb, MAX_EDIT_LEN); - if (unlikely(err)) { /* best policy is to drop on the floor */ - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; - } + if (unlikely(err)) /* best policy is to drop on the floor */ + goto drop; - rcu_read_lock(); action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) { - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - rcu_read_unlock(); - return action; - } + if (unlikely(action == TC_ACT_SHOT)) + goto drop; - p = rcu_dereference(d->skbmod_p); + p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); @@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); if (flags & SKBMOD_F_ETYPE) eth_hdr(skb)->h_proto = p->eth_type; - rcu_read_unlock(); if (flags & SKBMOD_F_SWAPMAC) { u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ @@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, } return action; + +drop: + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index f811850fd1d0..d42d9e112789 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -31,9 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_tunnel_key_params *params; int action; - rcu_read_lock(); - - params = rcu_dereference(t->params); + params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); @@ -53,8 +51,6 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, break; } - rcu_read_unlock(); - return action; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ad37f308175a..15a0ee214c9c 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - rcu_read_lock(); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference(v->vlan_p); + p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: @@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, case TCA_VLAN_ACT_MODIFY: /* No-op if no vlan tag (either hw-accel or in-payload) */ if (!skb_vlan_tagged(skb)) - goto unlock; + goto out; /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); @@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, BUG(); } - goto unlock; - -drop: - action = TC_ACT_SHOT; - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); - -unlock: - rcu_read_unlock(); +out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return action; + +drop: + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { -- cgit v1.2.3 From cd11b164073b719203318227918f9510809d5e10 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:44 +0200 Subject: net/tc: introduce TC_ACT_REINSERT. This is similar TC_ACT_REDIRECT, but with a slightly different semantic: - on ingress the mirred skbs are passed to the target device network stack without any additional check not scrubbing. - the rcu-protected stats provided via the tcf_result struct are updated on error conditions. This new tcfa_action value is not exposed to the user-space and can be used only internally by clsact. v1 -> v2: do not touch TC_ACT_REDIRECT code path, introduce a new action type instead v2 -> v3: - rename the new action value TC_ACT_REINJECT, update the helper accordingly - take care of uncloned reinjected packets in XDP generic hook v3 -> v4: - renamed again the new action value (JiriP) v4 -> v5: - fix build error with !NET_CLS_ACT (kbuild bot) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 28 ++++++++++++++++++++++++++++ net/core/dev.c | 6 +++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6d02f31abba8..22bfc3a13c25 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -7,6 +7,9 @@ #include #include +/* TC action not accessible from user space */ +#define TC_ACT_REINSERT (TC_ACT_VALUE_MAX + 1) + /* Basic packet classifier frontend definitions. */ struct tcf_walker { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bcae181c1857..a6d00093f35e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -235,6 +235,12 @@ struct tcf_result { u32 classid; }; const struct tcf_proto *goto_tp; + + /* used by the TC_ACT_REINSERT action */ + struct { + bool ingress; + struct gnet_stats_queue *qstats; + }; }; }; @@ -569,6 +575,15 @@ static inline void skb_reset_tc(struct sk_buff *skb) #endif } +static inline bool skb_is_tc_redirected(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_CLS_ACT + return skb->tc_redirected; +#else + return false; +#endif +} + static inline bool skb_at_tc_ingress(const struct sk_buff *skb) { #ifdef CONFIG_NET_CLS_ACT @@ -1108,4 +1123,17 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, struct mini_Qdisc __rcu **p_miniq); +static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res) +{ + struct gnet_stats_queue *stats = res->qstats; + int ret; + + if (res->ingress) + ret = netif_receive_skb(skb); + else + ret = dev_queue_xmit(skb); + if (ret && stats) + qstats_overlimit_inc(res->qstats); +} + #endif diff --git a/net/core/dev.c b/net/core/dev.c index 89031b5fef9f..38b0c414d780 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4252,7 +4252,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb)) + if (skb_cloned(skb) || skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -4602,6 +4602,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, __skb_push(skb, skb->mac_len); skb_do_redirect(skb); return NULL; + case TC_ACT_REINSERT: + /* this does not scrub the packet, and updates stats on error */ + skb_tc_reinsert(skb, &cl_res); + return NULL; default: break; } -- cgit v1.2.3 From 486cdf21583e5b1fad488a3e4f0a5242a31c0ffa Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Thu, 26 Jul 2018 02:10:40 +0000 Subject: bpf: add End.DT6 action to bpf_lwt_seg6_action helper The seg6local LWT provides the End.DT6 action, which allows to decapsulate an outer IPv6 header containing a Segment Routing Header (SRH), full specification is available here: https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-05 This patch adds this action now to the seg6local BPF interface. Since it is not mandatory that the inner IPv6 header also contains a SRH, seg6_bpf_srh_state has been extended with a pointer to a possible SRH of the outermost IPv6 header. This helps assessing if the validation must be triggered or not, and avoids some calls to ipv6_find_hdr. v3: s/1/true, s/0/false for boolean values v2: - changed true/false -> 1/0 - preempt_enable no longer called in first conditional block Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- include/net/seg6_local.h | 4 ++- net/core/filter.c | 88 ++++++++++++++++++++++++++++++++---------------- net/ipv6/seg6_local.c | 50 +++++++++++++++++---------- 3 files changed, 94 insertions(+), 48 deletions(-) (limited to 'include/net') diff --git a/include/net/seg6_local.h b/include/net/seg6_local.h index 661fd5b4d3e0..08359e2d8b35 100644 --- a/include/net/seg6_local.h +++ b/include/net/seg6_local.h @@ -21,10 +21,12 @@ extern int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id); +extern bool seg6_bpf_has_valid_srh(struct sk_buff *skb); struct seg6_bpf_srh_state { - bool valid; + struct ipv6_sr_hdr *srh; u16 hdrlen; + bool valid; }; DECLARE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); diff --git a/net/core/filter.c b/net/core/filter.c index 104d560946da..7df1a0f1d1e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4542,26 +4542,28 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; - struct ipv6_sr_hdr *srh; int srhoff = 0; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (srh == NULL) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) - srh_state->valid = 0; + srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; @@ -4577,52 +4579,78 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .arg4_type = ARG_CONST_SIZE }; -BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, - u32, action, void *, param, u32, param_len) +static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct ipv6_sr_hdr *srh; int srhoff = 0; - int err; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - - if (!srh_state->valid) { - if (unlikely((srh_state->hdrlen & 7) != 0)) - return -EBADMSG; - - srh->hdrlen = (u8)(srh_state->hdrlen >> 3); - if (unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) - return -EBADMSG; - srh_state->valid = 1; + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { + srh_state->srh = NULL; + } else { + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); + srh_state->hdrlen = srh_state->srh->hdrlen << 3; + srh_state->valid = true; } +} + +BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, + u32, action, void *, param, u32, param_len) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + int hdroff = 0; + int err; switch (action) { case SEG6_LOCAL_ACTION_END_X: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; + if (param_len != sizeof(int)) + return -EINVAL; + return seg6_lookup_nexthop(skb, NULL, *(int *)param); + case SEG6_LOCAL_ACTION_END_DT6: + if (!seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; + + if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) + return -EBADMSG; + if (!pskb_pull(skb, hdroff)) + return -EBADMSG; + + skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + bpf_compute_data_pointers(skb); + bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) + return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) - srh_state->hdrlen = - ((struct ipv6_sr_hdr *)param)->hdrlen << 3; + bpf_update_srh_state(skb); + return err; default: return -EINVAL; @@ -4644,15 +4672,14 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; - struct ipv6_sr_hdr *srh; struct ipv6hdr *hdr; int srhoff = 0; int ret; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (unlikely(srh == NULL)) return -EINVAL; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); @@ -4682,8 +4709,11 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return -EINVAL; + srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; - srh_state->valid = 0; + srh_state->valid = false; return 0; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index e1025b493a18..60325dbfe88b 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -459,36 +459,57 @@ drop: DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states); +bool seg6_bpf_has_valid_srh(struct sk_buff *skb) +{ + struct seg6_bpf_srh_state *srh_state = + this_cpu_ptr(&seg6_bpf_srh_states); + struct ipv6_sr_hdr *srh = srh_state->srh; + + if (unlikely(srh == NULL)) + return false; + + if (unlikely(!srh_state->valid)) { + if ((srh_state->hdrlen & 7) != 0) + return false; + + srh->hdrlen = (u8)(srh_state->hdrlen >> 3); + if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3)) + return false; + + srh_state->valid = true; + } + + return true; +} + static int input_action_end_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); - struct seg6_bpf_srh_state local_srh_state; struct ipv6_sr_hdr *srh; - int srhoff = 0; int ret; srh = get_and_validate_srh(skb); - if (!srh) - goto drop; + if (!srh) { + kfree_skb(skb); + return -EINVAL; + } advance_nextseg(srh, &ipv6_hdr(skb)->daddr); /* preempt_disable is needed to protect the per-CPU buffer srh_state, * which is also accessed by the bpf_lwt_seg6_* helpers */ preempt_disable(); + srh_state->srh = srh; srh_state->hdrlen = srh->hdrlen << 3; - srh_state->valid = 1; + srh_state->valid = true; rcu_read_lock(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(slwt->bpf.prog, skb); rcu_read_unlock(); - local_srh_state = *srh_state; - preempt_enable(); - switch (ret) { case BPF_OK: case BPF_REDIRECT: @@ -500,24 +521,17 @@ static int input_action_end_bpf(struct sk_buff *skb, goto drop; } - if (unlikely((local_srh_state.hdrlen & 7) != 0)) - goto drop; - - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) - goto drop; - srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - srh->hdrlen = (u8)(local_srh_state.hdrlen >> 3); - - if (!local_srh_state.valid && - unlikely(!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))) + if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) goto drop; + preempt_enable(); if (ret != BPF_REDIRECT) seg6_lookup_nexthop(skb, NULL, 0); return dst_input(skb); drop: + preempt_enable(); kfree_skb(skb); return -EINVAL; } -- cgit v1.2.3 From e6476c21447c4b17c47e476aade6facf050f31e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 30 Jul 2018 09:45:07 +0200 Subject: net: remove bogus RCU annotations on socket.wq We never use RCU protection for it, just a lot of cargo-cult rcu_deference_protects calls. Note that we do keep the kfree_rcu call for it, as the references through struct sock are RCU protected and thus might require a grace period before freeing. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- include/net/sock.h | 2 +- net/socket.c | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/linux/net.h b/include/linux/net.h index 6554d3ba4396..e0930678c8bf 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -114,7 +114,7 @@ struct socket { unsigned long flags; - struct socket_wq __rcu *wq; + struct socket_wq *wq; struct file *file; struct sock *sk; diff --git a/include/net/sock.h b/include/net/sock.h index 2afea5d1bdfe..433f45fc2d68 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1788,7 +1788,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); - sk->sk_wq = parent->wq; + rcu_assign_pointer(sk->sk_wq, parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; diff --git a/net/socket.c b/net/socket.c index 5b7df6695f4f..475247e347ae 100644 --- a/net/socket.c +++ b/net/socket.c @@ -251,7 +251,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; wq->flags = 0; - RCU_INIT_POINTER(ei->socket.wq, wq); + ei->socket.wq = wq; ei->socket.state = SS_UNCONNECTED; ei->socket.flags = 0; @@ -265,11 +265,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb) static void sock_destroy_inode(struct inode *inode) { struct socket_alloc *ei; - struct socket_wq *wq; ei = container_of(inode, struct socket_alloc, vfs_inode); - wq = rcu_dereference_protected(ei->socket.wq, 1); - kfree_rcu(wq, rcu); + kfree_rcu(ei->socket.wq, rcu); kmem_cache_free(sock_inode_cachep, ei); } @@ -603,7 +601,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) module_put(owner); } - if (rcu_dereference_protected(sock->wq, 1)->fasync_list) + if (sock->wq->fasync_list) pr_err("%s: fasync list not empty!\n", __func__); if (!sock->file) { @@ -1181,7 +1179,7 @@ static int sock_fasync(int fd, struct file *filp, int on) return -EINVAL; lock_sock(sk); - wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); + wq = sock->wq; fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) -- cgit v1.2.3 From 83ba4645152d1177c161750e1064e3a8e7cee19b Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Jul 2018 21:18:11 +0200 Subject: net: add helpers checking if socket can be bound to nonlocal address The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent" is present three times and its IPv6 counterpart is also present three times. We introduce two small helpers to characterize these tests uniformly. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/inet_sock.h | 8 ++++++++ include/net/ipv6.h | 7 +++++++ net/ipv4/af_inet.c | 3 +-- net/ipv4/ping.c | 6 ++---- net/ipv6/af_inet6.c | 6 ++---- net/ipv6/datagram.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 314be484c696..e03b93360f33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk) return !!inet_sk(sk)->convert_csum; } + +static inline bool inet_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv4.sysctl_ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a44509f4e985..82deb684ba73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2a0a3bab6b5..ee707b91d1a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -486,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b54c964ad925..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c9535354149f..020f6e14a7af 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!net->ipv6.sysctl.ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index f0264dfd38de..1ede7a16a0be 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -803,8 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && -- cgit v1.2.3 From 432e05d328921c68c35bfdeff7d7b7400b8e3d1a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:03 +0200 Subject: net: ipv4: Control SKB reprioritization after forwarding After IPv4 packets are forwarded, the priority of the corresponding SKB is updated according to the TOS field of IPv4 header. This overrides any prioritization done earlier by e.g. an skbedit action or ingress-qos-map defined at a vlan device. Such overriding may not always be desirable. Even if the packet ends up being routed, which implies this is an L3 network node, an administrator may wish to preserve whatever prioritization was done earlier on in the pipeline. Therefore introduce a sysctl that controls this behavior. Keep the default value at 1 to maintain backward-compatible behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ip_forward.c | 3 ++- net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 77c37fb0b6a6..e74515ecaa9c 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -81,6 +81,15 @@ fib_multipath_hash_policy - INTEGER 0 - Layer 3 1 - Layer 4 +ip_forward_update_priority - INTEGER + Whether to update SKB priority from "TOS" field in IPv4 header after it + is forwarded. The new SKB priority is mapped from TOS field value + according to an rt_tos2priority table (see e.g. man tc-prio). + Default: 1 (Update priority.) + Possible values: + 0 - Do not update priority. + 1 - Update priority. + route/max_size - INTEGER Maximum number of routes allowed in the kernel. Increase this when using large numbers of interfaces and/or routes. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 661348f23ea5..e47503b4e4d1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,7 @@ struct netns_ipv4 { int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_fwd_update_priority; int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ee707b91d1a7..20fda8fb8ffd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net) * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index b54b948b0596..32662e9e5d21 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb) !skb_sec_path(skb)) ip_rt_send_redirect(skb); - skb->priority = rt_tos2priority(iph->tos); + if (net->ipv4.sysctl_ip_fwd_update_priority) + skb->priority = rt_tos2priority(iph->tos); return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, rt->dst.dev, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5fa335fd3852..e21dda015513 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -663,6 +663,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_forward_update_priority", + .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, -- cgit v1.2.3 From d18c5d1995aa322b722fa731397e28ebcd00b3c6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 1 Aug 2018 00:36:42 +0200 Subject: net: ipv4: Notify about changes to ip_forward_update_priority Drivers may make offloading decision based on whether ip_forward_update_priority is enabled or not. Therefore distribute netevent notifications to give them a chance to react to a change. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + net/ipv4/sysctl_net_ipv4.c | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netevent.h b/include/net/netevent.h index d9918261701c..4107016c3bb4 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -28,6 +28,7 @@ enum netevent_notif_type { NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e21dda015513..b92f422f2fa8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, return ret; } +static int ipv4_fwd_update_priority(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv4.sysctl_ip_fwd_update_priority); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, + net); + + return ret; +} + static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -668,7 +685,7 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = ipv4_fwd_update_priority, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.3 From 290b1c8b1a902c0902df9ec05577ab209296f345 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:57 +0200 Subject: net: sched: make tcf_chain_{get,put}() static These are no longer used outside of cls_api.c so make them static. Move tcf_chain_flush() to avoid fwd declaration of tcf_chain_put(). Signed-off-by: Jiri Pirko v1->v2: - new patch Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- net/sched/cls_api.c | 34 ++++++++++++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22bfc3a13c25..ef727f71336e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -40,11 +40,8 @@ struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); -void tcf_chain_put(struct tcf_chain *chain); void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b194a5abfc6a..e8b0bbd0883f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -232,19 +232,6 @@ static void tcf_chain0_head_change(struct tcf_chain *chain, tcf_chain_head_change_item(item, tp_head); } -static void tcf_chain_flush(struct tcf_chain *chain) -{ - struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - - tcf_chain0_head_change(chain, NULL); - while (tp) { - RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_proto_destroy(tp, NULL); - tp = rtnl_dereference(chain->filter_chain); - tcf_chain_put(chain); - } -} - static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_block *block = chain->block; @@ -316,12 +303,11 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, return chain; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) { return __tcf_chain_get(block, chain_index, create, false); } -EXPORT_SYMBOL(tcf_chain_get); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { @@ -347,11 +333,10 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) } } -void tcf_chain_put(struct tcf_chain *chain) +static void tcf_chain_put(struct tcf_chain *chain) { __tcf_chain_put(chain, false); } -EXPORT_SYMBOL(tcf_chain_put); void tcf_chain_put_by_act(struct tcf_chain *chain) { @@ -365,6 +350,19 @@ static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) tcf_chain_put(chain); } +static void tcf_chain_flush(struct tcf_chain *chain) +{ + struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); + + tcf_chain0_head_change(chain, NULL); + while (tp) { + RCU_INIT_POINTER(chain->filter_chain, tp->next); + tcf_proto_destroy(tp, NULL); + tp = rtnl_dereference(chain->filter_chain); + tcf_chain_put(chain); + } +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; -- cgit v1.2.3 From db57dc7c7a5c42bb653425a01b6d73c49514b5db Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Wed, 1 Aug 2018 22:05:10 +0200 Subject: net: don't declare IPv6 non-local bind helper if CONFIG_IPV6 undefined Fixes: 83ba4645152d ("net: add helpers checking if socket can be bound to nonlocal address") Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ipv6.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 82deb684ba73..ff33f498c137 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,13 +766,6 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } -static inline bool ipv6_can_nonlocal_bind(struct net *net, - struct inet_sock *inet) -{ - return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; -} - /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; @@ -789,6 +782,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 -- cgit v1.2.3 From 285189c78eeb6f684a024b86fb5997d10c6aa564 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 25 Jul 2018 15:52:13 +0800 Subject: netfilter: use kvmalloc_array to allocate memory for hashtable nf_ct_alloc_hashtable is used to allocate memory for conntrack, NAT bysrc and expectation hashtable. Assuming 64k bucket size, which means 7th order page allocation, __get_free_pages, called by nf_ct_alloc_hashtable, will trigger the direct memory reclaim and stall for a long time, when system has lots of memory stress so replace combination of __get_free_pages and vzalloc with kvmalloc_array, which provides a overflow check and a fallback if no high order memory is available, and do not retry to reclaim memory, reduce stall and remove nf_ct_free_hashtable, since it is just a kvfree Signed-off-by: Zhang Yu Signed-off-by: Wang Li Signed-off-by: Li RongQing Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 -- net/netfilter/nf_conntrack_core.c | 29 ++++++----------------------- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_helper.c | 4 ++-- net/netfilter/nf_nat_core.c | 4 ++-- 5 files changed, 11 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index a2b0ed025908..7e012312cd61 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -176,8 +176,6 @@ void nf_ct_netns_put(struct net *net, u8 nfproto); */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); -void nf_ct_free_hashtable(void *hash, unsigned int size); - int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8a113ca1eea2..a676d5f76bdc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2022,16 +2022,6 @@ static int kill_all(struct nf_conn *i, void *data) return net_eq(nf_ct_net(i), data); } -void nf_ct_free_hashtable(void *hash, unsigned int size) -{ - if (is_vmalloc_addr(hash)) - vfree(hash); - else - free_pages((unsigned long)hash, - get_order(sizeof(struct hlist_head) * size)); -} -EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); - void nf_conntrack_cleanup_start(void) { conntrack_gc_work.exiting = true; @@ -2042,7 +2032,7 @@ void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_hook, NULL); cancel_delayed_work_sync(&conntrack_gc_work.dwork); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + kvfree(nf_conntrack_hash); nf_conntrack_proto_fini(); nf_conntrack_seqadj_fini(); @@ -2108,7 +2098,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; - size_t sz; if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) return NULL; @@ -2116,14 +2105,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); - if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) - return NULL; - - sz = nr_slots * sizeof(struct hlist_nulls_head); - hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - get_order(sz)); - if (!hash) - hash = vzalloc(sz); + hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head), + GFP_KERNEL | __GFP_ZERO); if (hash && nulls) for (i = 0; i < nr_slots; i++) @@ -2150,7 +2133,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) old_size = nf_conntrack_htable_size; if (old_size == hashsize) { - nf_ct_free_hashtable(hash, hashsize); + kvfree(hash); return 0; } @@ -2186,7 +2169,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize) local_bh_enable(); synchronize_net(); - nf_ct_free_hashtable(old_hash, old_size); + kvfree(old_hash); return 0; } @@ -2350,7 +2333,7 @@ err_acct: err_expect: kmem_cache_destroy(nf_conntrack_cachep); err_cachep: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + kvfree(nf_conntrack_hash); return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 3f586ba23d92..27b84231db10 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -712,5 +712,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); + kvfree(nf_ct_expect_hash); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index d557a425289d..e24b762ffa1d 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -562,12 +562,12 @@ int nf_conntrack_helper_init(void) return 0; out_extend: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + kvfree(nf_ct_helper_hash); return ret; } void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + kvfree(nf_ct_helper_hash); } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6366f0c0b8c1..e2b196054dfc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1056,7 +1056,7 @@ static int __init nf_nat_init(void) ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + kvfree(nf_nat_bysource); pr_err("Unable to register extension\n"); return ret; } @@ -1094,7 +1094,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); - nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); + kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } -- cgit v1.2.3 From eb9950eb31f56e57582a61c92073336d04a26542 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Aug 2018 17:06:56 +0100 Subject: rxrpc: Push iov_iter up from rxrpc_kernel_recv_data() to caller Push iov_iter up from rxrpc_kernel_recv_data() to its caller to allow non-contiguous iovs to be passed down, thereby permitting file reading to be simplified in the AFS filesystem in a future patch. Signed-off-by: David Howells Signed-off-by: David S. Miller --- fs/afs/rxrpc.c | 28 +++++++++++++++++----------- include/net/af_rxrpc.h | 2 +- net/rxrpc/recvmsg.c | 33 +++++++++++---------------------- 3 files changed, 29 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a1b18082991b..19db5f672a9d 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -346,7 +346,6 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; - size_t offset; s64 tx_total_len; int ret; @@ -433,10 +432,10 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { - offset = 0; - rxrpc_kernel_recv_data(call->net->socket, rxcall, NULL, - 0, &offset, false, &call->abort_code, - &call->service_id); + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, NULL, 0, 0); + rxrpc_kernel_recv_data(call->net->socket, rxcall, + &msg.msg_iter, false, + &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; } @@ -467,13 +466,14 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { - size_t offset = 0; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ | ITER_KVEC, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, - call->rxcall, - NULL, 0, &offset, false, + call->rxcall, &iter, false, &remote_abort, &call->service_id); - trace_afs_recv_data(call, 0, offset, false, ret); + trace_afs_recv_data(call, 0, 0, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; @@ -894,6 +894,8 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, bool want_more) { struct afs_net *net = call->net; + struct iov_iter iter; + struct kvec iov; enum afs_call_state state; u32 remote_abort = 0; int ret; @@ -903,10 +905,14 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count, ASSERTCMP(call->offset, <=, count); - ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, - buf, count, &call->offset, + iov.iov_base = buf + call->offset; + iov.iov_len = count - call->offset; + iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, count - call->offset); + + ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, &iter, want_more, &remote_abort, &call->service_id); + call->offset += (count - call->offset) - iov_iter_count(&iter); trace_afs_recv_data(call, count, call->offset, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 8ae8ee004258..f53edb3754bc 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,7 +61,7 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *, u16 *); + struct iov_iter *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a57ea96c84ea..816b19a78809 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -611,9 +611,7 @@ wait_error: * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info * @sock: The socket that the call exists on * @call: The call to send data through - * @buf: The buffer to receive into - * @size: The size of the buffer, including data already read - * @_offset: The running offset into the buffer. + * @iter: The buffer to receive into * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned * @_service: Where to store the actual service ID (may be upgraded) @@ -626,39 +624,30 @@ wait_error: * Note that we may return -EAGAIN to drain empty packets at the end of the * data, even if we've already copied over the requested data. * - * This function adds the amount it transfers to *_offset, so this should be - * precleared as appropriate. Note that the amount remaining in the buffer is - * taken to be size - *_offset. - * * *_abort should also be initialised to 0. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, - void *buf, size_t size, size_t *_offset, + struct iov_iter *iter, bool want_more, u32 *_abort, u16 *_service) { - struct iov_iter iter; - struct kvec iov; + size_t offset = 0; int ret; - _enter("{%d,%s},%zu/%zu,%d", + _enter("{%d,%s},%zu,%d", call->debug_id, rxrpc_call_states[call->state], - *_offset, size, want_more); + iov_iter_count(iter), want_more); - ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); - iov.iov_base = buf + *_offset; - iov.iov_len = size - *_offset; - iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); - mutex_lock(&call->user_mutex); switch (READ_ONCE(call->state)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: - ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, - _offset); + ret = rxrpc_recvmsg_data(sock, call, NULL, iter, + iov_iter_count(iter), 0, + &offset); if (ret < 0) goto out; @@ -667,7 +656,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, * full buffer or have been given -EAGAIN. */ if (ret == 1) { - if (*_offset < size) + if (iov_iter_count(iter) > 0) goto short_data; if (!want_more) goto read_phase_complete; @@ -704,7 +693,7 @@ out: if (_service) *_service = call->service_id; mutex_unlock(&call->user_mutex); - _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); + _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort); return ret; short_data: @@ -720,7 +709,7 @@ call_complete: ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; - if (size > 0) + if (iov_iter_count(iter) > 0) ret = -ECONNRESET; } goto out; -- cgit v1.2.3 From 31ba191bf5ab2975c1955f1adf771a5a0b57afaa Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 3 Aug 2018 14:53:15 +0800 Subject: include/net/bond_3ad: Simplify the code by using the ARRAY_SIZE We prefer to ARRAY_SIZE rather than the open code to calculate size. Signed-off-by: zhong jiang Signed-off-by: David S. Miller --- include/net/bond_3ad.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index f358ad5e4214..fc3111515f5c 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -283,7 +283,7 @@ static inline const char *bond_3ad_churn_desc(churn_state_t state) "none", "unknown" }; - int max_size = sizeof(churn_description) / sizeof(churn_description[0]); + int max_size = ARRAY_SIZE(churn_description); if (state >= max_size) state = max_size - 1; -- cgit v1.2.3 From fa0f527358bd900ef92f925878ed6bfbd51305cc Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 2 Aug 2018 23:34:39 +0000 Subject: ip: use rb trees for IP frag queue. Similar to TCP OOO RX queue, it makes sense to use rb trees to store IP fragments, so that OOO fragments are inserted faster. Tested: - a follow-up patch contains a rather comprehensive ip defrag self-test (functional) - ran neper `udp_stream -c -H -F 100 -l 300 -T 20`: netstat --statistics Ip: 282078937 total packets received 0 forwarded 0 incoming packets discarded 946760 incoming packets delivered 18743456 requests sent out 101 fragments dropped after timeout 282077129 reassemblies required 944952 packets reassembled ok 262734239 packet reassembles failed (The numbers/stats above are somewhat better re: reassemblies vs a kernel without this patchset. More comprehensive performance testing TBD). Reported-by: Jann Horn Reported-by: Juha-Matti Tilli Suggested-by: Eric Dumazet Signed-off-by: Peter Oskolkov Signed-off-by: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +- include/net/inet_frag.h | 3 +- net/ipv4/inet_fragment.c | 16 +-- net/ipv4/ip_fragment.c | 182 ++++++++++++++++++-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + net/ipv6/reassembly.c | 1 + 6 files changed, 121 insertions(+), 91 deletions(-) (limited to 'include/net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47848367c816..7ebdf158a795 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,16 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; - int ip_defrag_offset; }; }; - struct rb_node rbnode; /* used in netem & tcp stack */ + struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; }; - struct sock *sk; + + union { + struct sock *sk; + int ip_defrag_offset; + }; union { ktime_t tstamp; diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index f4272a29dc44..b86d14528188 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -75,7 +75,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; + struct sk_buff *fragments; /* Used in IPv6. */ + struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; ktime_t stamp; int len; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ccd140e4082d..6d258a5669e7 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q) fp = q->fragments; nf = q->net; f = nf->f; - while (fp) { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; + if (fp) { + do { + struct sk_buff *xp = fp->next; + + sum_truesize += fp->truesize; + kfree_skb(fp); + fp = xp; + } while (fp); + } else { + sum_truesize = skb_rbtree_purge(&q->rb_fragments); } sum = sum_truesize + f->qsize; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 960bf5eab59f..0e8f8de77e71 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; - struct sk_buff *head; + struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; @@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - - head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!qp->q.flags & INET_FRAG_FIRST_IN) goto out; + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ + if (qp->q.fragments) { + head = qp->q.fragments; + qp->q.fragments = head->next; + } else { + head = skb_rb_first(&qp->q.rb_fragments); + if (!head) + goto out; + rb_erase(&head->rbnode, &qp->q.rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == qp->q.fragments_tail) + qp->q.fragments_tail = NULL; + + sub_frag_mem_limit(qp->q.net, head->truesize); + head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - skb_get(head); spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - kfree_skb(head); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); + if (head) + kfree_skb(head); ipq_put(qp); } @@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp) end = atomic_inc_return(&peer->rid); qp->rid = end; - rc = qp->q.fragments && (end - start) > max; + rc = qp->q.fragments_tail && (end - start) > max; if (rc) { struct net *net; @@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp) static int ip_frag_reinit(struct ipq *qp) { - struct sk_buff *fp; unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { @@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); + sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments); sub_frag_mem_limit(qp->q.net, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->iif = 0; qp->ecn = 0; @@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct sk_buff *prev, *next; + struct rb_node **rbn, *parent; + struct sk_buff *skb1; struct net_device *dev; unsigned int fragsize; int flags, offset; @@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (err) goto err; - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || prev->ip_defrag_offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (next->ip_defrag_offset >= offset) - break; /* bingo! */ - prev = next; - } + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); -found: /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * - * We do the same here for IPv4. + * We do the same here for IPv4 (and increment an snmp counter). */ - /* Is there an overlap with the previous fragment? */ - if (prev && - (prev->ip_defrag_offset + prev->len) > offset) - goto discard_qp; - - /* Is there an overlap with the next fragment? */ - if (next && next->ip_defrag_offset < end) - goto discard_qp; + /* Find out where to put this fragment. */ + skb1 = qp->q.fragments_tail; + if (!skb1) { + /* This is the first fragment we've received. */ + rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node); + qp->q.fragments_tail = skb; + } else if ((skb1->ip_defrag_offset + skb1->len) < end) { + /* This is the common/special case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < (skb1->ip_defrag_offset + skb1->len)) + goto discard_qp; + /* Insert after skb1. */ + rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right); + qp->q.fragments_tail = skb; + } else { + /* Binary search. Note that skb can become the first fragment, but + * not the last (covered above). */ + rbn = &qp->q.rb_fragments.rb_node; + do { + parent = *rbn; + skb1 = rb_to_skb(parent); + if (end <= skb1->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= skb1->ip_defrag_offset + skb1->len) + rbn = &parent->rb_right; + else /* Found an overlap with skb1. */ + goto discard_qp; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. */ + rb_link_node(&skb->rbnode, parent, rbn); + } + rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - /* Note : skb->ip_defrag_offset and skb->dev share the same location */ - dev = skb->dev; if (dev) qp->iif = dev->ifindex; - /* Makes sure compiler wont do silly aliasing games */ - barrier(); skb->ip_defrag_offset = offset; - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -414,7 +425,7 @@ found: unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); + err = ip_frag_reasm(qp, skb, dev); skb->_skb_refdst = orefdst; return err; } @@ -431,15 +442,15 @@ err: return err; } - /* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, +static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct net_device *dev) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; + struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); + struct sk_buff **nextp; /* To build frag_list. */ + struct rb_node *rbn; int len; int ihlen; int err; @@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_fail; } /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); if (!fp) goto out_nomem; - - fp->next = head->next; - if (!fp->next) + rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments); + if (qp->q.fragments_tail == skb) qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; + skb_morph(skb, head); + rb_replace_node(&head->rbnode, &skb->rbnode, + &qp->q.rb_fragments); + consume_skb(head); + head = skb; } - WARN_ON(!head); WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ @@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) goto out_nomem; - clone->next = head->next; - head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; + skb->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(qp->q.net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; } - skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - for (fp=head->next; fp; fp = fp->next) { + /* Traverse the tree in order, to build frag_list. */ + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &qp->q.rb_fragments); + while (rbn) { + struct rb_node *rbnext = rb_next(rbn); + fp = rb_to_skb(rbn); + rb_erase(rbn, &qp->q.rb_fragments); + rbn = rbnext; + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); head->data_len += fp->len; head->len += fp->len; if (head->ip_summed != fp->ip_summed) @@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } sub_frag_mem_limit(qp->q.net, head->truesize); + *nextp = NULL; head->next = NULL; + head->prev = NULL; head->dev = dev; head->tstamp = qp->q.stamp; IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); @@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; + qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0610bdab721c..38d69ef516d5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic head->csum); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return true; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 6edd2ac8ae4b..b4e558ab39fa 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; + fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; return 1; -- cgit v1.2.3 From e4cc5a1873ac1297615962185f94adbbfaf6456b Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 6 Aug 2018 17:58:40 +0200 Subject: Bluetooth: btqca: Introduce HCI_EV_VENDOR and use it Using HCI_VENDOR_PKT for vendor specific events does work since it has also the value 0xff, but it is actually the packet type indicator constant and not the event constant. So introduce HCI_EV_VENDOR and use it. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- drivers/bluetooth/btqca.c | 6 +++--- include/net/bluetooth/hci.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index 488f5e7521dd..ec9e03a6b778 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -39,7 +39,7 @@ int qca_read_soc_version(struct hci_dev *hdev, u32 *soc_version) cmd = EDL_PATCH_VER_REQ_CMD; skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, EDL_PATCH_CMD_LEN, - &cmd, HCI_VENDOR_PKT, HCI_INIT_TIMEOUT); + &cmd, HCI_EV_VENDOR, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "Reading QCA version information failed (%d)", @@ -229,7 +229,7 @@ static int qca_tlv_send_segment(struct hci_dev *hdev, int seg_size, cmd); skb = __hci_cmd_sync_ev(hdev, EDL_PATCH_CMD_OPCODE, seg_size + 2, cmd, - HCI_VENDOR_PKT, HCI_INIT_TIMEOUT); + HCI_EV_VENDOR, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "QCA Failed to send TLV segment (%d)", err); @@ -318,7 +318,7 @@ int qca_set_bdaddr_rome(struct hci_dev *hdev, const bdaddr_t *bdaddr) cmd[2] = sizeof(bdaddr_t); /* size */ memcpy(cmd + 3, bdaddr, sizeof(bdaddr_t)); skb = __hci_cmd_sync_ev(hdev, EDL_NVM_ACCESS_OPCODE, sizeof(cmd), cmd, - HCI_VENDOR_PKT, HCI_INIT_TIMEOUT); + HCI_EV_VENDOR, HCI_INIT_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); bt_dev_err(hdev, "QCA Change address command failed (%d)", err); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 7f008097552e..4619a79b1bbb 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2176,6 +2176,8 @@ struct hci_evt_le_ext_adv_set_term { __u8 num_evts; } __packed; +#define HCI_EV_VENDOR 0xff + /* Internal events generated by Bluetooth stack */ #define HCI_EV_STACK_INTERNAL 0xfd struct hci_ev_stack_internal { -- cgit v1.2.3 From 4e665afbd7bee29b44b5d22821b56207f8459e39 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Tue, 7 Aug 2018 17:14:10 +0200 Subject: netfilter: cttimeout: move ctnl_untimeout to nf_conntrack As, ctnl_untimeout is required by nft_ct, so move ctnl_timeout from nfnetlink_cttimeout to nf_conntrack_timeout and rename as nf_ct_timeout. Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 1 + net/netfilter/nf_conntrack_timeout.c | 17 +++++++++++++++++ net/netfilter/nfnetlink_cttimeout.c | 20 ++------------------ 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 80ceb3d0291d..7a21bc0f00eb 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -83,6 +83,7 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); +void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout); #else static inline int nf_conntrack_timeout_init(void) { diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 46aee65f339b..401c2cce4a61 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -31,6 +31,23 @@ EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); +static int untimeout(struct nf_conn *ct, void *timeout) +{ + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + + /* We are not intended to delete this conntrack. */ + return 0; +} + +void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout) +{ + nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); +} +EXPORT_SYMBOL_GPL(nf_ct_untimeout); + static const struct nf_ct_ext_type timeout_extend = { .len = sizeof(struct nf_conn_timeout), .align = __alignof__(struct nf_conn_timeout), diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 4199e5300575..df53aef2d642 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -297,22 +297,6 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, return ret; } -static int untimeout(struct nf_conn *ct, void *timeout) -{ - struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); - - if (timeout_ext && (!timeout || timeout_ext->timeout == timeout)) - RCU_INIT_POINTER(timeout_ext->timeout, NULL); - - /* We are not intended to delete this conntrack. */ - return 0; -} - -static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) -{ - nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); -} - /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { @@ -325,7 +309,7 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); - ctnl_untimeout(net, timeout); + nf_ct_untimeout(net, timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; @@ -573,7 +557,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) struct ctnl_timeout *cur, *tmp; nf_ct_unconfirmed_destroy(net); - ctnl_untimeout(net, NULL); + nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); -- cgit v1.2.3 From 6c1fd7dc489d9bf64196f5b0fa33e059f64460c8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Aug 2018 17:14:15 +0200 Subject: netfilter: cttimeout: decouple timeout policy from nfnetlink_cttimeout object The timeout policy is currently embedded into the nfnetlink_cttimeout object, move the policy into an independent object. This allows us to reuse part of the existing conntrack timeout extension from nf_tables without adding dependencies with the nfnetlink_cttimeout object layout. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 22 ++++++++++------- net/netfilter/nf_conntrack_timeout.c | 6 ++--- net/netfilter/nfnetlink_cttimeout.c | 37 ++++++++++++++++------------ net/netfilter/xt_CT.c | 4 +-- 4 files changed, 39 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 7a21bc0f00eb..d5f62cc6c2ae 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -11,24 +11,28 @@ #define CTNL_TIMEOUT_NAME_MAX 32 +struct nf_ct_timeout { + __u16 l3num; + const struct nf_conntrack_l4proto *l4proto; + char data[0]; +}; + struct ctnl_timeout { struct list_head head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; - __u16 l3num; - const struct nf_conntrack_l4proto *l4proto; - char data[0]; + struct nf_ct_timeout timeout; }; struct nf_conn_timeout { - struct ctnl_timeout __rcu *timeout; + struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(struct nf_conn_timeout *t) { - struct ctnl_timeout *timeout; + struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) @@ -49,7 +53,7 @@ struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, - struct ctnl_timeout *timeout, + struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT @@ -83,7 +87,7 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); -void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout); +void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); #else static inline int nf_conntrack_timeout_init(void) { @@ -97,8 +101,8 @@ static inline void nf_conntrack_timeout_fini(void) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct ctnl_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); -extern void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout); +extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); +extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 401c2cce4a61..91fbd183da2d 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -24,11 +24,11 @@ #include #include -struct ctnl_timeout * +struct nf_ct_timeout * (*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); -void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout) __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); static int untimeout(struct nf_conn *ct, void *timeout) @@ -42,7 +42,7 @@ static int untimeout(struct nf_conn *ct, void *timeout) return 0; } -void nf_ct_untimeout(struct net *net, struct ctnl_timeout *timeout) +void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) { nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index df53aef2d642..d46a236cdf31 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -113,13 +113,13 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, /* You cannot replace one timeout policy by another of * different kind, sorry. */ - if (matching->l3num != l3num || - matching->l4proto->l4proto != l4num) + if (matching->timeout.l3num != l3num || + matching->timeout.l4proto->l4proto != l4num) return -EINVAL; - return ctnl_timeout_parse_policy(&matching->data, - matching->l4proto, net, - cda[CTA_TIMEOUT_DATA]); + return ctnl_timeout_parse_policy(&matching->timeout.data, + matching->timeout.l4proto, + net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; @@ -140,14 +140,14 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, goto err_proto_put; } - ret = ctnl_timeout_parse_policy(&timeout->data, l4proto, net, + ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); - timeout->l3num = l3num; - timeout->l4proto = l4proto; + timeout->timeout.l3num = l3num; + timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); @@ -166,7 +166,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; - const struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -179,8 +179,9 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->res_id = 0; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || - nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || - nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || + nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, + htons(timeout->timeout.l3num)) || + nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; @@ -194,7 +195,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (!nest_parms) goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, + &timeout->timeout.data); if (ret < 0) goto nla_put_failure; @@ -308,8 +310,8 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); - nf_ct_l4proto_put(timeout->l4proto); - nf_ct_untimeout(net, timeout); + nf_ct_l4proto_put(timeout->timeout.l4proto); + nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; @@ -510,8 +512,11 @@ err: return matching; } -static void ctnl_timeout_put(struct ctnl_timeout *timeout) +static void ctnl_timeout_put(struct nf_ct_timeout *t) { + struct ctnl_timeout *timeout = + container_of(t, struct ctnl_timeout, timeout); + if (refcount_dec_and_test(&timeout->refcnt)) kfree_rcu(timeout, rcu_head); @@ -561,7 +566,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { list_del_rcu(&cur->head); - nf_ct_l4proto_put(cur->l4proto); + nf_ct_l4proto_put(cur->timeout.l4proto); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 7ba454e9e3fa..89457efd2e00 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -104,7 +104,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -static void __xt_ct_tg_timeout_put(struct ctnl_timeout *timeout) +static void __xt_ct_tg_timeout_put(struct nf_ct_timeout *timeout) { typeof(nf_ct_timeout_put_hook) timeout_put; @@ -121,7 +121,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT typeof(nf_ct_timeout_find_get_hook) timeout_find_get; const struct nf_conntrack_l4proto *l4proto; - struct ctnl_timeout *timeout; + struct nf_ct_timeout *timeout; struct nf_conn_timeout *timeout_ext; const char *errmsg = NULL; int ret = 0; -- cgit v1.2.3 From ad83f2a9ce37a264202f48f4fd8889ee9056b703 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Aug 2018 17:14:19 +0200 Subject: netfilter: remove ifdef around cttimeout in struct nf_conntrack_l4proto Simplify this, include it inconditionally in this structure layout as we do with ctnetlink. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6068c6da3eac..8465263b297d 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -77,7 +77,6 @@ struct nf_conntrack_l4proto { struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); @@ -87,7 +86,6 @@ struct nf_conntrack_l4proto { u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; -#endif #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); -- cgit v1.2.3 From 92e2c4053623f21d61a683f7ef7bd61c8300ac7d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 7 Aug 2018 17:36:00 +0200 Subject: flow_dissector: allow dissection of tunnel options from metadata Allow the existing 'dissection' of tunnel metadata to 'dissect' options already present in tunnel metadata. This dissection is controlled by a new dissector key, FLOW_DISSECTOR_KEY_ENC_OPTS. This dissection only occurs when skb_flow_dissect_tunnel_info() is called, currently only the Flower classifier makes that call. So there should be no impact on other users of the flow dissector. This is in preparation for allowing the flower classifier to match on Geneve options. Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 17 +++++++++++++++++ net/core/flow_dissector.c | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 2a17f041f7a1..6a4586dcdede 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -57,6 +57,21 @@ struct flow_dissector_key_mpls { mpls_label:20; }; +#define FLOW_DIS_TUN_OPTS_MAX 255 +/** + * struct flow_dissector_key_enc_opts: + * @data: tunnel option data + * @len: length of tunnel option data + * @dst_opt_type: tunnel option type + */ +struct flow_dissector_key_enc_opts { + u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired + * here but seems difficult to #include + */ + u8 len; + __be16 dst_opt_type; +}; + struct flow_dissector_key_keyid { __be32 keyid; }; @@ -208,6 +223,8 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ + FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ + FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 08a5184f4b34..ce9eeeb7c024 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -154,7 +154,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ENC_IP)) + FLOW_DISSECTOR_KEY_ENC_IP) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); @@ -224,6 +226,21 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb, ip->tos = key->tos; ip->ttl = key->ttl; } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { + struct flow_dissector_key_enc_opts *enc_opt; + + enc_opt = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_OPTS, + target_container); + + if (info->options_len) { + enc_opt->len = info->options_len; + ip_tunnel_info_opts_get(enc_opt->data, info); + enc_opt->dst_opt_type = info->key.tun_flags & + TUNNEL_OPTIONS_PRESENT; + } + } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); -- cgit v1.2.3 From a8d5b4ab353738e16e5f9d21ab1e3d44b37983d0 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 3 Aug 2018 16:58:12 +0900 Subject: xdp: Helper function to clear kernel pointers in xdp_frame xdp_frame has kernel pointers which should not be readable from bpf programs. When we want to reuse xdp_frame region but it may be read by bpf programs later, we can use this helper to clear kernel pointers. This is more efficient than calling memset() for the entire struct. Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index fcb033f51d8c..76b95256c266 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -84,6 +84,13 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +/* Clear kernel pointers in xdp_frame */ +static inline void xdp_scrub_frame(struct xdp_frame *frame) +{ + frame->data = NULL; + frame->dev_rx = NULL; +} + /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) -- cgit v1.2.3 From aa12af77aae05008b3e637b85944dcd512f75eba Mon Sep 17 00:00:00 2001 From: Ankit Navik Date: Tue, 7 Aug 2018 13:16:35 +0530 Subject: Bluetooth: Add definitions for LE set address resolution Add the definitions for LE address resolution enable HCI commands. When the LE address resolution enable gets changed via HCI commands make sure that flag gets updated. Signed-off-by: Ankit Navik Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 3 +++ net/bluetooth/hci_event.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4619a79b1bbb..cdd9f1fe7cfa 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -269,6 +269,7 @@ enum { HCI_VENDOR_DIAG, HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, + HCI_LL_RPA_RESOLUTION, __HCI_NUM_FLAGS, }; @@ -1524,6 +1525,8 @@ struct hci_rp_le_read_resolv_list_size { __u8 size; } __packed; +#define HCI_OP_LE_SET_ADDR_RESOLV_ENABLE 0x202d + #define HCI_OP_LE_READ_MAX_DATA_LEN 0x202f struct hci_rp_le_read_max_data_len { __u8 status; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8078587572fe..f12555f23a49 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1480,6 +1480,30 @@ static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, hdev->le_resolv_list_size = rp->size; } +static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 *sent, status = *((__u8 *) skb->data); + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE); + if (!sent) + return; + + hci_dev_lock(hdev); + + if (*sent) + hci_dev_set_flag(hdev, HCI_LL_RPA_RESOLUTION); + else + hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION); + + hci_dev_unlock(hdev); +} + static void hci_cc_le_read_max_data_len(struct hci_dev *hdev, struct sk_buff *skb) { @@ -3263,6 +3287,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_le_read_resolv_list_size(hdev, skb); break; + case HCI_OP_LE_SET_ADDR_RESOLV_ENABLE: + hci_cc_le_set_addr_resolution_enable(hdev, skb); + break; + case HCI_OP_LE_READ_MAX_DATA_LEN: hci_cc_le_read_max_data_len(hdev, skb); break; -- cgit v1.2.3 From 40a1227ea845a37ab197dd1caffb60b047fa36b1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:21 -0700 Subject: tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket Although the actual cookie check "__cookie_v[46]_check()" does not involve sk specific info, it checks whether the sk has recent synq overflow event in "tcp_synq_no_recent_overflow()". The tcp_sk(sk)->rx_opt.ts_recent_stamp is updated every second when it has sent out a syncookie (through "tcp_synq_overflow()"). The above per sk "recent synq overflow event timestamp" works well for non SO_REUSEPORT use case. However, it may cause random connection request reject/discard when SO_REUSEPORT is used with syncookie because it fails the "tcp_synq_no_recent_overflow()" test. When SO_REUSEPORT is used, it usually has multiple listening socks serving TCP connection requests destinated to the same local IP:PORT. There are cases that the TCP-ACK-COOKIE may not be received by the same sk that sent out the syncookie. For example, if reuse->socks[] began with {sk0, sk1}, 1) sk1 sent out syncookies and tcp_sk(sk1)->rx_opt.ts_recent_stamp was updated. 2) the reuse->socks[] became {sk1, sk2} later. e.g. sk0 was first closed and then sk2 was added. Here, sk2 does not have ts_recent_stamp set. There are other ordering that will trigger the similar situation below but the idea is the same. 3) When the TCP-ACK-COOKIE comes back, sk2 was selected. "tcp_synq_no_recent_overflow(sk2)" returns true. In this case, all syncookies sent by sk1 will be handled (and rejected) by sk2 while sk1 is still alive. The userspace may create and remove listening SO_REUSEPORT sockets as it sees fit. E.g. Adding new thread (and SO_REUSEPORT sock) to handle incoming requests, old process stopping and new process starting...etc. With or without SO_ATTACH_REUSEPORT_[CB]BPF, the sockets leaving and joining a reuseport group makes picking the same sk to check the syncookie very difficult (if not impossible). The later patches will allow bpf prog more flexibility in deciding where a sk should be located in a bpf map and selecting a particular SO_REUSEPORT sock as it sees fit. e.g. Without closing any sock, replace the whole bpf reuseport_array in one map_update() by using map-in-map. Getting the syncookie check working smoothly across socks in the same "reuse->socks[]" is important. A partial solution is to set the newly added sk's ts_recent_stamp to the max ts_recent_stamp of a reuseport group but that will require to iterate through reuse->socks[] OR pessimistically set it to "now - TCP_SYNCOOKIE_VALID" when a sk is joining a reuseport group. However, neither of them will solve the existing sk getting moved around the reuse->socks[] and that sk may not have ts_recent_stamp updated, unlikely under continuous synflood but not impossible. This patch opts to treat the reuseport group as a whole when considering the last synq overflow timestamp since they are serving the same IP:PORT from the userspace (and BPF program) perspective. "synq_overflow_ts" is added to "struct sock_reuseport". The tcp_synq_overflow() and tcp_synq_no_recent_overflow() will update/check reuse->synq_overflow_ts if the sk is in a reuseport group. Similar to the reuseport decision in __inet_lookup_listener(), both sk->sk_reuseport and sk->sk_reuseport_cb are tested for SO_REUSEPORT usage. Update on "synq_overflow_ts" happens at roughly once every second. A synflood test was done with a 16 rx-queues and 16 reuseport sockets. No meaningful performance change is observed. Before and after the change is ~9Mpps in IPv4. Cc: Eric Dumazet Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/sock_reuseport.h | 4 ++++ include/net/tcp.h | 30 ++++++++++++++++++++++++++++-- net/core/sock_reuseport.c | 1 + 3 files changed, 33 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 0054b3a9b923..6bef7a0052f2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -12,6 +12,10 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ + /* The last synq overflow event timestamp of this + * reuse->socks[] group. + */ + unsigned int synq_overflow_ts; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; diff --git a/include/net/tcp.h b/include/net/tcp.h index d769dc20359b..d196901c9dba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -473,9 +474,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); */ static inline void tcp_synq_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + if (time_after32(now, last_overflow + HZ)) + WRITE_ONCE(reuse->synq_overflow_ts, now); + return; + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; if (time_after32(now, last_overflow + HZ)) tcp_sk(sk)->rx_opt.ts_recent_stamp = now; } @@ -483,9 +497,21 @@ static inline void tcp_synq_overflow(const struct sock *sk) /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { - unsigned int last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; + unsigned int last_overflow; unsigned int now = jiffies; + if (sk->sk_reuseport) { + struct sock_reuseport *reuse; + + reuse = rcu_dereference(sk->sk_reuseport_cb); + if (likely(reuse)) { + last_overflow = READ_ONCE(reuse->synq_overflow_ts); + return time_after32(now, last_overflow + + TCP_SYNCOOKIE_VALID); + } + } + + last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp; return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID); } diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 064acb04be0f..3f188fad0162 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -81,6 +81,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); + more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts); for (i = 0; i < reuse->num_socks; ++i) rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb, -- cgit v1.2.3 From 736b46027eb4a4c602d3b8b93d2f48c9facbd915 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:22 -0700 Subject: net: Add ID (if needed) to sock_reuseport and expose reuseport_lock A later patch will introduce a BPF_MAP_TYPE_REUSEPORT_ARRAY which allows a SO_REUSEPORT sk to be added to a bpf map. When a sk is removed from reuse->socks[], it also needs to be removed from the bpf map. Also, when adding a sk to a bpf map, the bpf map needs to ensure it is indeed in a reuse->socks[]. Hence, reuseport_lock is needed by the bpf map to ensure its map_update_elem() and map_delete_elem() operations are in-sync with the reuse->socks[]. The BPF_MAP_TYPE_REUSEPORT_ARRAY map will only acquire the reuseport_lock after ensuring the adding sk is already in a reuseport group (i.e. reuse->socks[]). The map_lookup_elem() will be lockless. This patch also adds an ID to sock_reuseport. A later patch will introduce BPF_PROG_TYPE_SK_REUSEPORT which allows a bpf prog to select a sk from a bpf map. It is inflexible to statically enforce a bpf map can only contain the sk belonging to a particular reuse->socks[] (i.e. same IP:PORT) during the bpf verification time. For example, think about the the map-in-map situation where the inner map can be dynamically changed in runtime and the outer map may have inner maps belonging to different reuseport groups. Hence, when the bpf prog (in the new BPF_PROG_TYPE_SK_REUSEPORT type) selects a sk, this selected sk has to be checked to ensure it belongs to the requesting reuseport group (i.e. the group serving that IP:PORT). The "sk->sk_reuseport_cb" pointer cannot be used for this checking purpose because the pointer value will change after reuseport_grow(). Instead of saving all checking conditions like the ones preced calling "reuseport_add_sock()" and compare them everytime a bpf_prog is run, a 32bits ID is introduced to survive the reuseport_grow(). The ID is only acquired if any of the reuse->socks[] is added to the newly introduced "BPF_MAP_TYPE_REUSEPORT_ARRAY" map. If "BPF_MAP_TYPE_REUSEPORT_ARRAY" is not used, the changes in this patch is a no-op. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/sock_reuseport.h | 6 ++++++ net/core/sock_reuseport.c | 27 ++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 6bef7a0052f2..e1a7681856f7 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -5,8 +5,11 @@ #include #include #include +#include #include +extern spinlock_t reuseport_lock; + struct sock_reuseport { struct rcu_head rcu; @@ -16,6 +19,8 @@ struct sock_reuseport { * reuse->socks[] group. */ unsigned int synq_overflow_ts; + /* ID stays the same even after the size of socks[] grows. */ + unsigned int reuseport_id; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; @@ -29,5 +34,6 @@ extern struct sock *reuseport_select_sock(struct sock *sk, int hdr_len); extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); +int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 3f188fad0162..cf2e4d305af9 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -8,11 +8,33 @@ #include #include +#include #include #define INIT_SOCKS 128 -static DEFINE_SPINLOCK(reuseport_lock); +DEFINE_SPINLOCK(reuseport_lock); + +#define REUSEPORT_MIN_ID 1 +static DEFINE_IDA(reuseport_ida); + +int reuseport_get_id(struct sock_reuseport *reuse) +{ + int id; + + if (reuse->reuseport_id) + return reuse->reuseport_id; + + id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0, + /* Called under reuseport_lock */ + GFP_ATOMIC); + if (id < 0) + return id; + + reuse->reuseport_id = id; + + return reuse->reuseport_id; +} static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { @@ -78,6 +100,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->max_socks = more_socks_size; more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; + more_reuse->reuseport_id = reuse->reuseport_id; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -102,6 +125,8 @@ static void reuseport_free_rcu(struct rcu_head *head) reuse = container_of(head, struct sock_reuseport, rcu); if (reuse->prog) bpf_prog_destroy(reuse->prog); + if (reuse->reuseport_id) + ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); } -- cgit v1.2.3 From 2dbb9b9e6df67d444fbe425c7f6014858d337adf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:25 -0700 Subject: bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT This patch adds a BPF_PROG_TYPE_SK_REUSEPORT which can select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY. Like other non SK_FILTER/CGROUP_SKB program, it requires CAP_SYS_ADMIN. BPF_PROG_TYPE_SK_REUSEPORT introduces "struct sk_reuseport_kern" to store the bpf context instead of using the skb->cb[48]. At the SO_REUSEPORT sk lookup time, it is in the middle of transiting from a lower layer (ipv4/ipv6) to a upper layer (udp/tcp). At this point, it is not always clear where the bpf context can be appended in the skb->cb[48] to avoid saving-and-restoring cb[]. Even putting aside the difference between ipv4-vs-ipv6 and udp-vs-tcp. It is not clear if the lower layer is only ipv4 and ipv6 in the future and will it not touch the cb[] again before transiting to the upper layer. For example, in udp_gro_receive(), it uses the 48 byte NAPI_GRO_CB instead of IP[6]CB and it may still modify the cb[] after calling the udp[46]_lib_lookup_skb(). Because of the above reason, if sk->cb is used for the bpf ctx, saving-and-restoring is needed and likely the whole 48 bytes cb[] has to be saved and restored. Instead of saving, setting and restoring the cb[], this patch opts to create a new "struct sk_reuseport_kern" and setting the needed values in there. The new BPF_PROG_TYPE_SK_REUSEPORT and "struct sk_reuseport_(kern|md)" will serve all ipv4/ipv6 + udp/tcp combinations. There is no protocol specific usage at this point and it is also inline with the current sock_reuseport.c implementation (i.e. no protocol specific requirement). In "struct sk_reuseport_md", this patch exposes data/data_end/len with semantic similar to other existing usages. Together with "bpf_skb_load_bytes()" and "bpf_skb_load_bytes_relative()", the bpf prog can peek anywhere in the skb. The "bind_inany" tells the bpf prog that the reuseport group is bind-ed to a local INANY address which cannot be learned from skb. The new "bind_inany" is added to "struct sock_reuseport" which will be used when running the new "BPF_PROG_TYPE_SK_REUSEPORT" bpf prog in order to avoid repeating the "bind INANY" test on "sk_v6_rcv_saddr/sk->sk_rcv_saddr" every time a bpf prog is run. It can only be properly initialized when a "sk->sk_reuseport" enabled sk is adding to a hashtable (i.e. during "reuseport_alloc()" and "reuseport_add_sock()"). The new "sk_select_reuseport()" is the main helper that the bpf prog will use to select a SO_REUSEPORT sk. It is the only function that can use the new BPF_MAP_TYPE_REUSEPORT_ARRAY. As mentioned in the earlier patch, the validity of a selected sk is checked in run time in "sk_select_reuseport()". Doing the check in verification time is difficult and inflexible (consider the map-in-map use case). The runtime check is to compare the selected sk's reuseport_id with the reuseport_id that we want. This helper will return -EXXX if the selected sk cannot serve the incoming request (e.g. reuseport_id not match). The bpf prog can decide if it wants to do SK_DROP as its discretion. When the bpf prog returns SK_PASS, the kernel will check if a valid sk has been selected (i.e. "reuse_kern->selected_sk != NULL"). If it does , it will use the selected sk. If not, the kernel will select one from "reuse->socks[]" (as before this patch). The SK_DROP and SK_PASS handling logic will be in the next patch. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 3 + include/linux/filter.h | 15 +++ include/net/addrconf.h | 1 + include/net/sock_reuseport.h | 6 +- include/uapi/linux/bpf.h | 36 +++++- kernel/bpf/verifier.c | 9 ++ net/core/filter.c | 269 +++++++++++++++++++++++++++++++++++++++- net/core/sock_reuseport.c | 20 ++- net/ipv4/inet_connection_sock.c | 9 ++ net/ipv4/inet_hashtables.c | 5 +- net/ipv4/udp.c | 5 +- 11 files changed, 365 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 14fd6c02d258..cd26c090e7c0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -29,6 +29,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #endif +#ifdef CONFIG_INET +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 2b072dab32c0..70e9d57677fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -32,6 +32,7 @@ struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; +struct sock_reuseport; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -833,6 +834,20 @@ void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_msg_redirect_map(struct sk_msg_buff *md); +#ifdef CONFIG_INET +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash); +#else +static inline struct sock * +bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + return NULL; +} +#endif + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 5f43f7a70fe6..6def0351bcc3 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -108,6 +108,7 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); +bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index e1a7681856f7..73b569556be6 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -21,12 +21,14 @@ struct sock_reuseport { unsigned int synq_overflow_ts; /* ID stays the same even after the size of socks[] grows. */ unsigned int reuseport_id; + bool bind_inany; struct bpf_prog __rcu *prog; /* optional BPF sock selector */ struct sock *socks[0]; /* array of sock pointers */ }; -extern int reuseport_alloc(struct sock *sk); -extern int reuseport_add_sock(struct sock *sk, struct sock *sk2); +extern int reuseport_alloc(struct sock *sk, bool bind_inany); +extern int reuseport_add_sock(struct sock *sk, struct sock *sk2, + bool bind_inany); extern void reuseport_detach_sock(struct sock *sk); extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40f584bc7da0..3102a2a23c31 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -151,6 +151,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, }; enum bpf_attach_type { @@ -2114,6 +2115,14 @@ union bpf_attr { * the shared data. * Return * Pointer to the local storage area. + * + * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map + * It checks the selected sk is matching the incoming + * request in the skb. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2197,7 +2206,8 @@ union bpf_attr { FN(rc_keydown), \ FN(skb_cgroup_id), \ FN(get_current_cgroup_id), \ - FN(get_local_storage), + FN(get_local_storage), \ + FN(sk_select_reuseport), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2414,6 +2424,30 @@ struct sk_msg_md { __u32 local_port; /* stored in host byte order */ }; +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + void *data; + void *data_end; /* End of directly accessible data */ + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 587468a9c37d..ca90679a7fe5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1310,6 +1310,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_REUSEPORT: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -2166,6 +2167,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_hash) goto error; break; + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + if (func_id != BPF_FUNC_sk_select_reuseport) + goto error; + break; default: break; } @@ -2217,6 +2222,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) goto error; break; + case BPF_FUNC_sk_select_reuseport: + if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 2de7dd9f2a57..142595b4e0d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1462,7 +1462,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk); + err = reuseport_alloc(sk, false); if (err) return err; } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { @@ -7013,3 +7013,270 @@ out: release_sock(sk); return ret; } + +#ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; + +static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, + struct sock_reuseport *reuse, + struct sock *sk, struct sk_buff *skb, + u32 hash) +{ + reuse_kern->skb = skb; + reuse_kern->sk = sk; + reuse_kern->selected_sk = NULL; + reuse_kern->data_end = skb->data + skb_headlen(skb); + reuse_kern->hash = hash; + reuse_kern->reuseport_id = reuse->reuseport_id; + reuse_kern->bind_inany = reuse->bind_inany; +} + +struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, + struct bpf_prog *prog, struct sk_buff *skb, + u32 hash) +{ + struct sk_reuseport_kern reuse_kern; + enum sk_action action; + + bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); + action = BPF_PROG_RUN(prog, &reuse_kern); + + if (action == SK_PASS) + return reuse_kern.selected_sk; + else + return ERR_PTR(-ECONNREFUSED); +} + +BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, + struct bpf_map *, map, void *, key, u32, flags) +{ + struct sock_reuseport *reuse; + struct sock *selected_sk; + + selected_sk = map->ops->map_lookup_elem(map, key); + if (!selected_sk) + return -ENOENT; + + reuse = rcu_dereference(selected_sk->sk_reuseport_cb); + if (!reuse) + /* selected_sk is unhashed (e.g. by close()) after the + * above map_lookup_elem(). Treat selected_sk has already + * been removed from the map. + */ + return -ENOENT; + + if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { + struct sock *sk; + + if (unlikely(!reuse_kern->reuseport_id)) + /* There is a small race between adding the + * sk to the map and setting the + * reuse_kern->reuseport_id. + * Treat it as the sk has not been added to + * the bpf map yet. + */ + return -ENOENT; + + sk = reuse_kern->sk; + if (sk->sk_protocol != selected_sk->sk_protocol) + return -EPROTOTYPE; + else if (sk->sk_family != selected_sk->sk_family) + return -EAFNOSUPPORT; + + /* Catch all. Likely bound to a different sockaddr. */ + return -EBADFD; + } + + reuse_kern->selected_sk = selected_sk; + + return 0; +} + +static const struct bpf_func_proto sk_select_reuseport_proto = { + .func = sk_select_reuseport, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(sk_reuseport_load_bytes, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len) +{ + return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { + .func = sk_reuseport_load_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(sk_reuseport_load_bytes_relative, + const struct sk_reuseport_kern *, reuse_kern, u32, offset, + void *, to, u32, len, u32, start_header) +{ + return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, + len, start_header); +} + +static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { + .func = sk_reuseport_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +sk_reuseport_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_sk_select_reuseport: + return &sk_select_reuseport_proto; + case BPF_FUNC_skb_load_bytes: + return &sk_reuseport_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &sk_reuseport_load_bytes_relative_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static bool +sk_reuseport_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const u32 size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct sk_reuseport_md) || + off % size || type != BPF_READ) + return false; + + switch (off) { + case offsetof(struct sk_reuseport_md, data): + info->reg_type = PTR_TO_PACKET; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + return size == sizeof(__u64); + + case offsetof(struct sk_reuseport_md, hash): + return size == size_default; + + /* Fields that allow narrowing */ + case offsetof(struct sk_reuseport_md, eth_protocol): + if (size < FIELD_SIZEOF(struct sk_buff, protocol)) + return false; + case offsetof(struct sk_reuseport_md, ip_protocol): + case offsetof(struct sk_reuseport_md, bind_inany): + case offsetof(struct sk_reuseport_md, len): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + + default: + return false; + } +} + +#define SK_REUSEPORT_LOAD_FIELD(F) ({ \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + si->dst_reg, si->src_reg, \ + bpf_target_off(struct sk_reuseport_kern, F, \ + FIELD_SIZEOF(struct sk_reuseport_kern, F), \ + target_size)); \ + }) + +#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ + SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ + struct sk_buff, \ + skb, \ + SKB_FIELD) + +#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ + struct sock, \ + sk, \ + SK_FIELD, BPF_SIZE, EXTRA_OFF) + +static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_reuseport_md, data): + SK_REUSEPORT_LOAD_SKB_FIELD(data); + break; + + case offsetof(struct sk_reuseport_md, len): + SK_REUSEPORT_LOAD_SKB_FIELD(len); + break; + + case offsetof(struct sk_reuseport_md, eth_protocol): + SK_REUSEPORT_LOAD_SKB_FIELD(protocol); + break; + + case offsetof(struct sk_reuseport_md, ip_protocol): + BUILD_BUG_ON(hweight_long(SK_FL_PROTO_MASK) != BITS_PER_BYTE); + SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, + BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian + * aware. No further narrowing or masking is needed. + */ + *target_size = 1; + break; + + case offsetof(struct sk_reuseport_md, data_end): + SK_REUSEPORT_LOAD_FIELD(data_end); + break; + + case offsetof(struct sk_reuseport_md, hash): + SK_REUSEPORT_LOAD_FIELD(hash); + break; + + case offsetof(struct sk_reuseport_md, bind_inany): + SK_REUSEPORT_LOAD_FIELD(bind_inany); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_reuseport_verifier_ops = { + .get_func_proto = sk_reuseport_func_proto, + .is_valid_access = sk_reuseport_is_valid_access, + .convert_ctx_access = sk_reuseport_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_reuseport_prog_ops = { +}; +#endif /* CONFIG_INET */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 8235f2439816..d260167f5f77 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -51,7 +51,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) return reuse; } -int reuseport_alloc(struct sock *sk) +int reuseport_alloc(struct sock *sk, bool bind_inany) { struct sock_reuseport *reuse; @@ -63,9 +63,17 @@ int reuseport_alloc(struct sock *sk) /* Allocation attempts can occur concurrently via the setsockopt path * and the bind/hash path. Nothing to do when we lose the race. */ - if (rcu_dereference_protected(sk->sk_reuseport_cb, - lockdep_is_held(&reuseport_lock))) + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (reuse) { + /* Only set reuse->bind_inany if the bind_inany is true. + * Otherwise, it will overwrite the reuse->bind_inany + * which was set by the bind/hash path. + */ + if (bind_inany) + reuse->bind_inany = bind_inany; goto out; + } reuse = __reuseport_alloc(INIT_SOCKS); if (!reuse) { @@ -75,6 +83,7 @@ int reuseport_alloc(struct sock *sk) reuse->socks[0] = sk; reuse->num_socks = 1; + reuse->bind_inany = bind_inany; rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -101,6 +110,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->num_socks = reuse->num_socks; more_reuse->prog = reuse->prog; more_reuse->reuseport_id = reuse->reuseport_id; + more_reuse->bind_inany = reuse->bind_inany; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -136,12 +146,12 @@ static void reuseport_free_rcu(struct rcu_head *head) * @sk2: Socket belonging to the existing reuseport group. * May return ENOMEM and not add socket to group under memory pressure. */ -int reuseport_add_sock(struct sock *sk, struct sock *sk2) +int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany) { struct sock_reuseport *old_reuse, *reuse; if (!rcu_access_pointer(sk2->sk_reuseport_cb)) { - int err = reuseport_alloc(sk2); + int err = reuseport_alloc(sk2, bind_inany); if (err) return err; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 33a88e045efd..dfd5009f96ef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -107,6 +107,15 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, } EXPORT_SYMBOL(inet_rcv_saddr_equal); +bool inet_rcv_saddr_any(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#endif + return !sk->sk_rcv_saddr; +} + void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 3647167c8fa3..370e24463fb7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -567,10 +567,11 @@ static int inet_reuseport_add_sock(struct sock *sk, inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } int __inet_hash(struct sock *sk, struct sock *osk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 060e841dde40..038dd7909051 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -221,11 +221,12 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { - return reuseport_add_sock(sk, sk2); + return reuseport_add_sock(sk, sk2, + inet_rcv_saddr_any(sk)); } } - return reuseport_alloc(sk); + return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** -- cgit v1.2.3 From 8217ca653ec601246832d562207bc24bdf652d2f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Aug 2018 01:01:26 -0700 Subject: bpf: Enable BPF_PROG_TYPE_SK_REUSEPORT bpf prog in reuseport selection This patch allows a BPF_PROG_TYPE_SK_REUSEPORT bpf prog to select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY introduced in the earlier patch. "bpf_run_sk_reuseport()" will return -ECONNREFUSED when the BPF_PROG_TYPE_SK_REUSEPORT prog returns SK_DROP. The callers, in inet[6]_hashtable.c and ipv[46]/udp.c, are modified to handle this case and return NULL immediately instead of continuing the sk search from its hashtable. It re-uses the existing SO_ATTACH_REUSEPORT_EBPF setsockopt to attach BPF_PROG_TYPE_SK_REUSEPORT. The "sk_reuseport_attach_bpf()" will check if the attaching bpf prog is in the new SK_REUSEPORT or the existing SOCKET_FILTER type and then check different things accordingly. One level of "__reuseport_attach_prog()" call is removed. The "sk_unhashed() && ..." and "sk->sk_reuseport_cb" tests are pushed back to "reuseport_attach_prog()" in sock_reuseport.c. sock_reuseport.c seems to have more knowledge on those test requirements than filter.c. In "reuseport_attach_prog()", after new_prog is attached to reuse->prog, the old_prog (if any) is also directly freed instead of returning the old_prog to the caller and asking the caller to free. The sysctl_optmem_max check is moved back to the "sk_reuseport_attach_filter()" and "sk_reuseport_attach_bpf()". As of other bpf prog types, the new BPF_PROG_TYPE_SK_REUSEPORT is only bounded by the usual "bpf_prog_charge_memlock()" during load time instead of bounded by both bpf_prog_charge_memlock and sysctl_optmem_max. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock_reuseport.h | 3 +- net/core/filter.c | 87 ++++++++++++++++++++++++++------------------ net/core/sock_reuseport.c | 36 +++++++++++++----- net/ipv4/inet_hashtables.c | 14 ++++--- net/ipv4/udp.c | 4 ++ net/ipv6/inet6_hashtables.c | 14 ++++--- net/ipv6/udp.c | 4 ++ 8 files changed, 106 insertions(+), 57 deletions(-) (limited to 'include/net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 70e9d57677fe..5d565c50bcb2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -753,6 +753,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index 73b569556be6..8a5f70c7cdf2 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -34,8 +34,7 @@ extern struct sock *reuseport_select_sock(struct sock *sk, u32 hash, struct sk_buff *skb, int hdr_len); -extern struct bpf_prog *reuseport_attach_prog(struct sock *sk, - struct bpf_prog *prog); +extern int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog); int reuseport_get_id(struct sock_reuseport *reuse); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 142595b4e0d1..22906b31d43f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1453,30 +1453,6 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return 0; } -static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) -{ - struct bpf_prog *old_prog; - int err; - - if (bpf_prog_size(prog->len) > sysctl_optmem_max) - return -ENOMEM; - - if (sk_unhashed(sk) && sk->sk_reuseport) { - err = reuseport_alloc(sk, false); - if (err) - return err; - } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { - /* The socket wasn't bound with SO_REUSEPORT */ - return -EINVAL; - } - - old_prog = reuseport_attach_prog(sk, prog); - if (old_prog) - bpf_prog_destroy(old_prog); - - return 0; -} - static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1550,13 +1526,15 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { + if (bpf_prog_size(prog->len) > sysctl_optmem_max) + err = -ENOMEM; + else + err = reuseport_attach_prog(sk, prog); + + if (err) __bpf_prog_release(prog); - return err; - } - return 0; + return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) @@ -1586,19 +1564,58 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { - struct bpf_prog *prog = __get_bpf(ufd, sk); + struct bpf_prog *prog; int err; + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); + if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) + prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); - err = __reuseport_attach_prog(prog, sk); - if (err < 0) { - bpf_prog_put(prog); - return err; + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { + /* Like other non BPF_PROG_TYPE_SOCKET_FILTER + * bpf prog (e.g. sockmap). It depends on the + * limitation imposed by bpf_prog_load(). + * Hence, sysctl_optmem_max is not checked. + */ + if ((sk->sk_type != SOCK_STREAM && + sk->sk_type != SOCK_DGRAM) || + (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) || + (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6)) { + err = -ENOTSUPP; + goto err_prog_put; + } + } else { + /* BPF_PROG_TYPE_SOCKET_FILTER */ + if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + err = -ENOMEM; + goto err_prog_put; + } } - return 0; + err = reuseport_attach_prog(sk, prog); +err_prog_put: + if (err) + bpf_prog_put(prog); + + return err; +} + +void sk_reuseport_prog_free(struct bpf_prog *prog) +{ + if (!prog) + return; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + bpf_prog_put(prog); + else + bpf_prog_destroy(prog); } struct bpf_scratchpad { diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index d260167f5f77..ba5cba56f574 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #define INIT_SOCKS 128 @@ -133,8 +134,7 @@ static void reuseport_free_rcu(struct rcu_head *head) struct sock_reuseport *reuse; reuse = container_of(head, struct sock_reuseport, rcu); - if (reuse->prog) - bpf_prog_destroy(reuse->prog); + sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1)); if (reuse->reuseport_id) ida_simple_remove(&reuseport_ida, reuse->reuseport_id); kfree(reuse); @@ -219,9 +219,9 @@ void reuseport_detach_sock(struct sock *sk) } EXPORT_SYMBOL(reuseport_detach_sock); -static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks, - struct bpf_prog *prog, struct sk_buff *skb, - int hdr_len) +static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, + struct bpf_prog *prog, struct sk_buff *skb, + int hdr_len) { struct sk_buff *nskb = NULL; u32 index; @@ -282,9 +282,15 @@ struct sock *reuseport_select_sock(struct sock *sk, /* paired with smp_wmb() in reuseport_add_sock() */ smp_rmb(); - if (prog && skb) - sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); + if (!prog || !skb) + goto select_by_hash; + + if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) + sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash); + else + sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len); +select_by_hash: /* no bpf or invalid bpf result: fall back to hash usage */ if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; @@ -296,12 +302,21 @@ out: } EXPORT_SYMBOL(reuseport_select_sock); -struct bpf_prog * -reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) +int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) { struct sock_reuseport *reuse; struct bpf_prog *old_prog; + if (sk_unhashed(sk) && sk->sk_reuseport) { + int err = reuseport_alloc(sk, false); + + if (err) + return err; + } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { + /* The socket wasn't bound with SO_REUSEPORT */ + return -EINVAL; + } + spin_lock_bh(&reuseport_lock); reuse = rcu_dereference_protected(sk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); @@ -310,6 +325,7 @@ reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog) rcu_assign_pointer(reuse->prog, prog); spin_unlock_bh(&reuseport_lock); - return old_prog; + sk_reuseport_prog_free(old_prog); + return 0; } EXPORT_SYMBOL(reuseport_attach_prog); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 370e24463fb7..f5c9ef2586de 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -328,7 +328,7 @@ struct sock *__inet_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with INADDR_ANY */ @@ -337,9 +337,10 @@ struct sock *__inet_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each_rcu(sk, &ilb->head) { @@ -352,12 +353,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 038dd7909051..f4e35b2ff8b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -499,6 +499,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -513,6 +515,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 595ad408dba0..3d7c7460a0c5 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -191,7 +191,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, daddr, hnum, dif, sdif); if (result) - return result; + goto done; /* Lookup lhash2 with in6addr_any */ @@ -200,9 +200,10 @@ struct sock *inet6_lookup_listener(struct net *net, if (ilb2->count > ilb->count) goto port_lookup; - return inet6_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, - dif, sdif); + result = inet6_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + goto done; port_lookup: sk_for_each(sk, &ilb->head) { @@ -214,12 +215,15 @@ port_lookup: result = reuseport_select_sock(sk, phash, skb, doff); if (result) - return result; + goto done; } result = sk; hiscore = score; } } +done: + if (unlikely(IS_ERR(result))) + return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6b96956a8ed..83f4c77c79d8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -235,6 +235,8 @@ struct sock *__udp6_lib_lookup(struct net *net, exact_dif, hslot2, skb); } + if (unlikely(IS_ERR(result))) + return NULL; return result; } begin: @@ -249,6 +251,8 @@ begin: saddr, sport); result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); + if (unlikely(IS_ERR(result))) + return NULL; if (result) return result; } -- cgit v1.2.3 From 466466dc6c28ca9dc401f10e235b9cde9a7c9162 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Aug 2018 09:38:09 -0700 Subject: tcp: mandate a one-time immediate ACK Add a new flag to indicate a one-time immediate ACK. This flag is occasionaly set under specific TCP protocol states in addition to the more common quickack mechanism for interactive application. In several cases in the TCP code we want to force an immediate ACK but do not want to call tcp_enter_quickack_mode() because we do not want to forget the icsk_ack.pingpong or icsk_ack.ato state. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- net/ipv4/tcp_input.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0a6c9e0f2b5a..fa43b82607d9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -167,7 +167,8 @@ enum inet_csk_ack_state_t { ICSK_ACK_SCHED = 1, ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, - ICSK_ACK_PUSHED2 = 8 + ICSK_ACK_PUSHED2 = 8, + ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */ }; void inet_csk_init_xmit_timers(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 715d541b52dd..b8849588c440 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5179,7 +5179,9 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk)) { + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { send_now: tcp_send_ack(sk); return; -- cgit v1.2.3 From 05364ca03cfd419caecb292fede20eb39667eaae Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Fri, 10 Aug 2018 20:11:42 +0300 Subject: net/sctp: Make wrappers for accessing in/out streams This patch introduces wrappers for accessing in/out streams indirectly. This will enable to replace physically contiguous memory arrays of streams with flexible arrays (or maybe any other appropriate mechanism) which do memory allocation on a per-page basis. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 35 +++++++++++++++++------- net/sctp/chunk.c | 6 ++-- net/sctp/outqueue.c | 11 ++++---- net/sctp/socket.c | 4 +-- net/sctp/stream.c | 65 +++++++++++++++++++++++--------------------- net/sctp/stream_interleave.c | 20 +++++++------- net/sctp/stream_sched.c | 13 +++++---- net/sctp/stream_sched_prio.c | 22 +++++++-------- net/sctp/stream_sched_rr.c | 8 +++--- 9 files changed, 103 insertions(+), 81 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ab869e0d8326..6b2b8df8a1d2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -398,37 +398,35 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ - ((stream)->type[sid].ssn) + (sctp_stream_##type((stream), (sid))->ssn) /* Return the next SSN number for this stream. */ #define sctp_ssn_next(stream, type, sid) \ - ((stream)->type[sid].ssn++) + (sctp_stream_##type((stream), (sid))->ssn++) /* Skip over this ssn and all below. */ #define sctp_ssn_skip(stream, type, sid, ssn) \ - ((stream)->type[sid].ssn = ssn + 1) + (sctp_stream_##type((stream), (sid))->ssn = ssn + 1) /* What is the current MID number for this stream? */ #define sctp_mid_peek(stream, type, sid) \ - ((stream)->type[sid].mid) + (sctp_stream_##type((stream), (sid))->mid) /* Return the next MID number for this stream. */ #define sctp_mid_next(stream, type, sid) \ - ((stream)->type[sid].mid++) + (sctp_stream_##type((stream), (sid))->mid++) /* Skip over this mid and all below. */ #define sctp_mid_skip(stream, type, sid, mid) \ - ((stream)->type[sid].mid = mid + 1) - -#define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid]) + (sctp_stream_##type((stream), (sid))->mid = mid + 1) /* What is the current MID_uo number for this stream? */ #define sctp_mid_uo_peek(stream, type, sid) \ - ((stream)->type[sid].mid_uo) + (sctp_stream_##type((stream), (sid))->mid_uo) /* Return the next MID_uo number for this stream. */ #define sctp_mid_uo_next(stream, type, sid) \ - ((stream)->type[sid].mid_uo++) + (sctp_stream_##type((stream), (sid))->mid_uo++) /* * Pointers to address related SCTP functions. @@ -1463,6 +1461,23 @@ struct sctp_stream { struct sctp_stream_interleave *si; }; +static inline struct sctp_stream_out *sctp_stream_out( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_out *)(stream->out)) + sid; +} + +static inline struct sctp_stream_in *sctp_stream_in( + const struct sctp_stream *stream, + __u16 sid) +{ + return ((struct sctp_stream_in *)(stream->in)) + sid; +} + +#define SCTP_SO(s, i) sctp_stream_out((s), (i)) +#define SCTP_SI(s, i) sctp_stream_in((s), (i)) + #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index bfb9f812e2ef..ce8087846f05 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -325,7 +325,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; @@ -339,7 +340,8 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = - &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; + SCTP_SO(&chunk->asoc->stream, + chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d68aa33485a9..d74d00b29942 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -80,7 +80,7 @@ static inline void sctp_outq_head_data(struct sctp_outq *q, q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); - oute = q->asoc->stream.out[stream].ext; + oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add(&ch->stream_list, &oute->outq); } @@ -101,7 +101,7 @@ static inline void sctp_outq_tail_data(struct sctp_outq *q, q->out_qlen += ch->skb->len; stream = sctp_chunk_stream_no(ch); - oute = q->asoc->stream.out[stream].ext; + oute = SCTP_SO(&q->asoc->stream, stream)->ext; list_add_tail(&ch->stream_list, &oute->outq); } @@ -372,7 +372,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); - streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; + streamout = SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; @@ -416,7 +416,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { struct sctp_stream_out *streamout = - &asoc->stream.out[chk->sinfo.sinfo_stream]; + SCTP_SO(&asoc->stream, chk->sinfo.sinfo_stream); streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } @@ -1082,6 +1082,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, /* Finally, transmit new packets. */ while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); + __u8 stream_state = SCTP_SO(&ctx->asoc->stream, sid)->state; /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { @@ -1091,7 +1092,7 @@ static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, continue; } - if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { + if (stream_state == SCTP_STREAM_CLOSED) { sctp_outq_head_data(ctx->q, chunk); break; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 502c0d7cb105..e96b15a66aba 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1911,7 +1911,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, goto err; } - if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) { err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); if (err) goto err; @@ -7154,7 +7154,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamoute = asoc->stream.out[params.sprstat_sid].ext; + streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext; if (!streamoute) { /* Not allocated yet, means all stats are 0 */ params.sprstat_abandoned_unsent = 0; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f1f1d1b232ba..7ca6fe4e7882 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -162,7 +162,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; sched->init(stream); @@ -193,7 +193,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; - stream->out[sid].ext = soute; + SCTP_SO(stream, sid)->ext = soute; return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } @@ -205,7 +205,7 @@ void sctp_stream_free(struct sctp_stream *stream) sched->free(stream); for (i = 0; i < stream->outcnt; i++) - kfree(stream->out[i].ext); + kfree(SCTP_SO(stream, i)->ext); kfree(stream->out); kfree(stream->in); } @@ -215,12 +215,12 @@ void sctp_stream_clear(struct sctp_stream *stream) int i; for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) @@ -273,8 +273,8 @@ static bool sctp_stream_outq_is_empty(struct sctp_stream *stream, for (i = 0; i < str_nums; i++) { __u16 sid = ntohs(str_list[i]); - if (stream->out[sid].ext && - !list_empty(&stream->out[sid].ext->outq)) + if (SCTP_SO(stream, sid)->ext && + !list_empty(&SCTP_SO(stream, sid)->ext->outq)) return false; } @@ -361,11 +361,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (out) { if (str_nums) for (i = 0; i < str_nums; i++) - stream->out[str_list[i]].state = + SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; } asoc->strreset_chunk = chunk; @@ -380,11 +380,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc, if (str_nums) for (i = 0; i < str_nums; i++) - stream->out[str_list[i]].state = + SCTP_SO(stream, str_list[i])->state = SCTP_STREAM_OPEN; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; goto out; } @@ -418,7 +418,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) /* Block further xmit of data until this request is completed */ for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; sctp_chunk_hold(asoc->strreset_chunk); @@ -429,7 +429,7 @@ int sctp_send_reset_assoc(struct sctp_association *asoc) asoc->strreset_chunk = NULL; for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; return retval; } @@ -609,10 +609,10 @@ struct sctp_chunk *sctp_process_strreset_outreq( } for (i = 0; i < nums; i++) - stream->in[ntohs(str_p[i])].mid = 0; + SCTP_SI(stream, ntohs(str_p[i]))->mid = 0; } else { for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } result = SCTP_STRRESET_PERFORMED; @@ -683,11 +683,11 @@ struct sctp_chunk *sctp_process_strreset_inreq( if (nums) for (i = 0; i < nums; i++) - stream->out[ntohs(str_p[i])].state = + SCTP_SO(stream, ntohs(str_p[i]))->state = SCTP_STREAM_CLOSED; else for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_CLOSED; + SCTP_SO(stream, i)->state = SCTP_STREAM_CLOSED; asoc->strreset_chunk = chunk; asoc->strreset_outstanding = 1; @@ -786,11 +786,11 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * incoming and outgoing streams. */ for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; result = SCTP_STRRESET_PERFORMED; @@ -979,15 +979,18 @@ struct sctp_chunk *sctp_process_strreset_resp( sizeof(__u16); if (result == SCTP_STRRESET_PERFORMED) { + struct sctp_stream_out *sout; if (nums) { for (i = 0; i < nums; i++) { - stream->out[ntohs(str_p[i])].mid = 0; - stream->out[ntohs(str_p[i])].mid_uo = 0; + sout = SCTP_SO(stream, ntohs(str_p[i])); + sout->mid = 0; + sout->mid_uo = 0; } } else { for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + sout = SCTP_SO(stream, i); + sout->mid = 0; + sout->mid_uo = 0; } } @@ -995,7 +998,7 @@ struct sctp_chunk *sctp_process_strreset_resp( } for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, nums, str_p, GFP_ATOMIC); @@ -1050,15 +1053,15 @@ struct sctp_chunk *sctp_process_strreset_resp( asoc->adv_peer_ack_point = asoc->ctsn_ack_point; for (i = 0; i < stream->outcnt; i++) { - stream->out[i].mid = 0; - stream->out[i].mid_uo = 0; + SCTP_SO(stream, i)->mid = 0; + SCTP_SO(stream, i)->mid_uo = 0; } for (i = 0; i < stream->incnt; i++) - stream->in[i].mid = 0; + SCTP_SI(stream, i)->mid = 0; } for (i = 0; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, stsn, rtsn, GFP_ATOMIC); @@ -1072,7 +1075,7 @@ struct sctp_chunk *sctp_process_strreset_resp( if (result == SCTP_STRRESET_PERFORMED) for (i = number; i < stream->outcnt; i++) - stream->out[i].state = SCTP_STREAM_OPEN; + SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN; else stream->outcnt = number; diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index d3764c181299..0a78cdf86463 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -197,7 +197,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial( __u32 next_fsn = 0; int is_last = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -278,7 +278,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( __u32 pd_len = 0; __u32 mid = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -368,7 +368,7 @@ static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, sctp_intl_store_reasm(ulpq, event); - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); @@ -575,7 +575,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( __u32 next_fsn = 0; int is_last = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -659,7 +659,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( __u32 pd_len = 0; __u32 mid = 0; - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); @@ -750,7 +750,7 @@ static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, sctp_intl_store_reasm_uo(ulpq, event); - sin = sctp_stream_in(ulpq->asoc, event->stream); + sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); @@ -774,7 +774,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); - csin = sctp_stream_in(ulpq->asoc, cevent->stream); + csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; @@ -875,7 +875,7 @@ static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); - csin = sctp_stream_in(ulpq->asoc, cevent->stream); + csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; @@ -1053,7 +1053,7 @@ static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { - struct sctp_stream_in *sin = &stream->in[sid]; + struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { @@ -1247,7 +1247,7 @@ static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { - struct sctp_stream_in *sin = sctp_stream_in(ulpq->asoc, sid); + struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index f5fcd425232a..a6c04a94b08f 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -161,7 +161,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, /* Give the next scheduler a clean slate. */ for (i = 0; i < asoc->stream.outcnt; i++) { - void *p = asoc->stream.out[i].ext; + void *p = SCTP_SO(&asoc->stream, i)->ext; if (!p) continue; @@ -175,7 +175,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { - if (!asoc->stream.out[i].ext) + if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); @@ -217,7 +217,7 @@ int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, if (sid >= asoc->stream.outcnt) return -EINVAL; - if (!asoc->stream.out[sid].ext) { + if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); @@ -234,7 +234,7 @@ int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, if (sid >= asoc->stream.outcnt) return -EINVAL; - if (!asoc->stream.out[sid].ext) + if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); @@ -252,7 +252,7 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) * priority stream comes in. */ sid = sctp_chunk_stream_no(ch); - sout = &q->asoc->stream.out[sid]; + sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } @@ -272,8 +272,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; - INIT_LIST_HEAD(&stream->out[sid].ext->outq); + INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 7997d35dd0fd..2245083a98f2 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -75,10 +75,10 @@ static struct sctp_stream_priorities *sctp_sched_prio_get_head( /* No luck. So we search on all streams now. */ for (i = 0; i < stream->outcnt; i++) { - if (!stream->out[i].ext) + if (!SCTP_SO(stream, i)->ext) continue; - p = stream->out[i].ext->prio_head; + p = SCTP_SO(stream, i)->ext->prio_head; if (!p) /* Means all other streams won't be initialized * as well. @@ -165,7 +165,7 @@ static void sctp_sched_prio_sched(struct sctp_stream *stream, static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, __u16 prio, gfp_t gfp) { - struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out *sout = SCTP_SO(stream, sid); struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; @@ -186,7 +186,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, return 0; for (i = 0; i < stream->outcnt; i++) { - soute = stream->out[i].ext; + soute = SCTP_SO(stream, i)->ext; if (soute && soute->prio_head == old) /* It's still in use, nothing else to do here. */ return 0; @@ -201,7 +201,7 @@ static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { - *value = stream->out[sid].ext->prio_head->prio; + *value = SCTP_SO(stream, sid)->ext->prio_head->prio; return 0; } @@ -215,7 +215,7 @@ static int sctp_sched_prio_init(struct sctp_stream *stream) static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list); return sctp_sched_prio_set(stream, sid, 0, gfp); } @@ -233,9 +233,9 @@ static void sctp_sched_prio_free(struct sctp_stream *stream) */ sctp_sched_prio_unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { - if (!stream->out[i].ext) + if (!SCTP_SO(stream, i)->ext) continue; - prio = stream->out[i].ext->prio_head; + prio = SCTP_SO(stream, i)->ext->prio_head; if (prio && list_empty(&prio->prio_sched)) list_add(&prio->prio_sched, &list); } @@ -255,7 +255,7 @@ static void sctp_sched_prio_enqueue(struct sctp_outq *q, ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; - sctp_sched_prio_sched(stream, stream->out[sid].ext); + sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) @@ -297,7 +297,7 @@ static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, * this priority. */ sid = sctp_chunk_stream_no(ch); - soute = q->asoc->stream.out[sid].ext; + soute = SCTP_SO(&q->asoc->stream, sid)->ext; prio = soute->prio_head; sctp_sched_prio_next_stream(prio); @@ -317,7 +317,7 @@ static void sctp_sched_prio_sched_all(struct sctp_stream *stream) __u16 sid; sid = sctp_chunk_stream_no(ch); - sout = &stream->out[sid]; + sout = SCTP_SO(stream, sid); if (sout->ext) sctp_sched_prio_sched(stream, sout->ext); } diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 1155692448f1..52ba743fa7a7 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -100,7 +100,7 @@ static int sctp_sched_rr_init(struct sctp_stream *stream) static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->rr_list); return 0; } @@ -120,7 +120,7 @@ static void sctp_sched_rr_enqueue(struct sctp_outq *q, ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; - sctp_sched_rr_sched(stream, stream->out[sid].ext); + sctp_sched_rr_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) @@ -154,7 +154,7 @@ static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, /* Last chunk on that msg, move to the next stream */ sid = sctp_chunk_stream_no(ch); - soute = q->asoc->stream.out[sid].ext; + soute = SCTP_SO(&q->asoc->stream, sid)->ext; sctp_sched_rr_next_stream(&q->asoc->stream); @@ -173,7 +173,7 @@ static void sctp_sched_rr_sched_all(struct sctp_stream *stream) __u16 sid; sid = sctp_chunk_stream_no(ch); - soute = stream->out[sid].ext; + soute = SCTP_SO(stream, sid)->ext; if (soute) sctp_sched_rr_sched(stream, soute); } -- cgit v1.2.3 From 0d493b4d0be352b5e361e4fa0bc3efe952d8b10e Mon Sep 17 00:00:00 2001 From: Konstantin Khorenko Date: Fri, 10 Aug 2018 20:11:43 +0300 Subject: net/sctp: Replace in/out stream arrays with flex_array This path replaces physically contiguous memory arrays allocated using kmalloc_array() with flexible arrays. This enables to avoid memory allocation failures on the systems under a memory stress. Signed-off-by: Oleg Babin Signed-off-by: Konstantin Khorenko Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 9 ++--- net/sctp/stream.c | 88 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6b2b8df8a1d2..28a7c8e44636 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -57,6 +57,7 @@ #include /* This gets us atomic counters. */ #include /* We need sk_buff_head. */ #include /* We need tq_struct. */ +#include /* We need flex_array. */ #include /* We need sctp* header structs. */ #include /* We need auth specific structs */ #include /* For inet_skb_parm */ @@ -1438,8 +1439,8 @@ struct sctp_stream_in { }; struct sctp_stream { - struct sctp_stream_out *out; - struct sctp_stream_in *in; + struct flex_array *out; + struct flex_array *in; __u16 outcnt; __u16 incnt; /* Current stream being sent, if any */ @@ -1465,14 +1466,14 @@ static inline struct sctp_stream_out *sctp_stream_out( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_out *)(stream->out)) + sid; + return flex_array_get(stream->out, sid); } static inline struct sctp_stream_in *sctp_stream_in( const struct sctp_stream *stream, __u16 sid) { - return ((struct sctp_stream_in *)(stream->in)) + sid; + return flex_array_get(stream->in, sid); } #define SCTP_SO(s, i) sctp_stream_out((s), (i)) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 7ca6fe4e7882..ffb940d3b57c 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -37,6 +37,53 @@ #include #include +static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count, + gfp_t gfp) +{ + struct flex_array *result; + int err; + + result = flex_array_alloc(elem_size, elem_count, gfp); + if (result) { + err = flex_array_prealloc(result, 0, elem_count, gfp); + if (err) { + flex_array_free(result); + result = NULL; + } + } + + return result; +} + +static void fa_free(struct flex_array *fa) +{ + if (fa) + flex_array_free(fa); +} + +static void fa_copy(struct flex_array *fa, struct flex_array *from, + size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(from, index); + flex_array_put(fa, index, elem, 0); + index++; + } +} + +static void fa_zero(struct flex_array *fa, size_t index, size_t count) +{ + void *elem; + + while (count--) { + elem = flex_array_get(fa, index); + memset(elem, 0, fa->element_size); + index++; + } +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations. Also, removes those chunks to streams * higher than the new max. @@ -78,34 +125,33 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { - kfree(new->out[i].ext); - new->out[i].ext = stream->out[i].ext; - stream->out[i].ext = NULL; + kfree(SCTP_SO(new, i)->ext); + SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; + SCTP_SO(stream, i)->ext = NULL; } } for (i = outcnt; i < stream->outcnt; i++) - kfree(stream->out[i].ext); + kfree(SCTP_SO(stream, i)->ext); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, gfp_t gfp) { - struct sctp_stream_out *out; + struct flex_array *out; + size_t elem_size = sizeof(struct sctp_stream_out); - out = kmalloc_array(outcnt, sizeof(*out), gfp); + out = fa_alloc(elem_size, outcnt, gfp); if (!out) return -ENOMEM; if (stream->out) { - memcpy(out, stream->out, min(outcnt, stream->outcnt) * - sizeof(*out)); - kfree(stream->out); + fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt)); + fa_free(stream->out); } if (outcnt > stream->outcnt) - memset(out + stream->outcnt, 0, - (outcnt - stream->outcnt) * sizeof(*out)); + fa_zero(out, stream->outcnt, (outcnt - stream->outcnt)); stream->out = out; @@ -115,22 +161,20 @@ static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, gfp_t gfp) { - struct sctp_stream_in *in; - - in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + struct flex_array *in; + size_t elem_size = sizeof(struct sctp_stream_in); + in = fa_alloc(elem_size, incnt, gfp); if (!in) return -ENOMEM; if (stream->in) { - memcpy(in, stream->in, min(incnt, stream->incnt) * - sizeof(*in)); - kfree(stream->in); + fa_copy(in, stream->in, 0, min(incnt, stream->incnt)); + fa_free(stream->in); } if (incnt > stream->incnt) - memset(in + stream->incnt, 0, - (incnt - stream->incnt) * sizeof(*in)); + fa_zero(in, stream->incnt, (incnt - stream->incnt)); stream->in = in; @@ -174,7 +218,7 @@ in: ret = sctp_stream_alloc_in(stream, incnt, gfp); if (ret) { sched->free(stream); - kfree(stream->out); + fa_free(stream->out); stream->out = NULL; stream->outcnt = 0; goto out; @@ -206,8 +250,8 @@ void sctp_stream_free(struct sctp_stream *stream) sched->free(stream); for (i = 0; i < stream->outcnt; i++) kfree(SCTP_SO(stream, i)->ext); - kfree(stream->out); - kfree(stream->in); + fa_free(stream->out); + fa_free(stream->in); } void sctp_stream_clear(struct sctp_stream *stream) -- cgit v1.2.3 From 84a75b329be84c108a21ab9c02a52a9bf9e5a919 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:52 +0300 Subject: net: sched: extend action ops with put_dev callback As a preparation for removing dependency on rtnl lock from rules update path, all users of shared objects must take reference while working with them. Extend action ops with put_dev() API to be used on net device returned by get_dev(). Modify mirred action (only action that implements get_dev callback): - Take reference to net device in get_dev. - Implement put_dev API that releases reference to net device. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_mirred.c | 12 +++++++++++- net/sched/cls_api.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c9bc02d05e1..1ad5b19e83a9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + void (*put_dev)(struct net_device *dev); int (*delete)(struct net *net, u32 index); }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b26d060da08e..7a045cc7fe3b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -358,8 +358,17 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); + + if (dev) + dev_hold(dev); - return rtnl_dereference(m->tcfm_dev); + return dev; +} + +static void tcf_mirred_put_dev(struct net_device *dev) +{ + dev_put(dev); } static int tcf_mirred_delete(struct net *net, u32 index) @@ -382,6 +391,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .put_dev = tcf_mirred_put_dev, .delete = tcf_mirred_delete, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f922ce27ed5e..31bd1439cf60 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2176,6 +2176,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, if (!dev) continue; ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + a->ops->put_dev(dev); if (ret < 0) return ret; ok_count += ret; -- cgit v1.2.3 From 51a9f5ae653979ac4bdbd81778a10431f0177e3c Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:54 +0300 Subject: net: core: protect rate estimator statistics pointer with lock Extend gen_new_estimator() to also take stats_lock when re-assigning rate estimator statistics pointer. (to be used by unlocked actions) Rename 'stats_lock' to 'lock' and change argument description to explain that it is now also used for control path. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/gen_stats.h | 4 ++-- net/core/gen_estimator.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 0304ba2ae353..883bb9085f15 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -59,13 +59,13 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 98fd12721221..e4e442d70c2d 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -112,7 +112,7 @@ static void est_timer(struct timer_list *t) * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount * @opt: rate estimator configuration TLV * @@ -128,7 +128,7 @@ static void est_timer(struct timer_list *t) int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { @@ -154,19 +154,22 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, seqcount_init(&est->seq); intvl_log = parm->interval + 2; est->bstats = bstats; - est->stats_lock = stats_lock; + est->stats_lock = lock; est->running = running; est->ewma_log = parm->ewma_log; est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; - if (stats_lock) + if (lock) local_bh_disable(); est_fetch_counters(est, &b); - if (stats_lock) + if (lock) local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; + + if (lock) + spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { del_timer_sync(&old->timer); @@ -179,6 +182,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, mod_timer(&est->timer, est->next_jiffies); rcu_assign_pointer(*rate_est, est); + if (lock) + spin_unlock_bh(lock); if (old) kfree_rcu(old, rcu); return 0; @@ -209,7 +214,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * @bstats: basic statistics * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics - * @stats_lock: statistics lock + * @lock: lock for statistics and control path * @running: qdisc running seqcount (might be NULL) * @opt: rate estimator configuration TLV * @@ -221,11 +226,11 @@ EXPORT_SYMBOL(gen_kill_estimator); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, - spinlock_t *stats_lock, + spinlock_t *lock, seqcount_t *running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, - stats_lock, running, opt); + lock, running, opt); } EXPORT_SYMBOL(gen_replace_estimator); -- cgit v1.2.3 From 353c9cb360874e737fb000545f783df756c06f9a Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Sat, 11 Aug 2018 20:27:24 +0000 Subject: ip: add helpers to process in-order fragments faster. This patch introduces several helper functions/macros that will be used in the follow-up patch. No runtime changes yet. The new logic (fully implemented in the second patch) is as follows: * Nodes in the rb-tree will now contain not single fragments, but lists of consecutive fragments ("runs"). * At each point in time, the current "active" run at the tail is maintained/tracked. Fragments that arrive in-order, adjacent to the previous tail fragment, are added to this tail run without triggering the re-balancing of the rb-tree. * If a fragment arrives out of order with the offset _before_ the tail run, it is inserted into the rb-tree as a single fragment. * If a fragment arrives after the current tail fragment (with a gap), it starts a new "tail" run, as is inserted into the rb-tree at the end as the head of the new run. skb->cb is used to store additional information needed here (suggested by Eric Dumazet). Reported-by: Willem de Bruijn Signed-off-by: Peter Oskolkov Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 6 ++++ net/ipv4/ip_fragment.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b86d14528188..1662cbc0b46b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -57,7 +57,9 @@ struct frag_v6_compare_key { * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head + * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail + * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far @@ -78,6 +80,7 @@ struct inet_frag_queue { struct sk_buff *fragments; /* Used in IPv6. */ struct rb_root rb_fragments; /* Used in IPv4. */ struct sk_buff *fragments_tail; + struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; @@ -113,6 +116,9 @@ void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); +/* Free all skbs in the queue; return the sum of their truesizes. */ +unsigned int inet_frag_rbtree_purge(struct rb_root *root); + static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7cb7ed761d8c..26ace9d2d976 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,6 +57,57 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + struct inet_skb_parm h; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void ip4_frag_init_run(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) +{ + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + ip4_frag_init_run(skb); + q->fragments_tail = skb; + q->last_run_head = skb; +} + /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -654,6 +705,28 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + #ifdef CONFIG_SYSCTL static int dist_min; -- cgit v1.2.3 From 0b243d004ea640875115d1500ec429a3e9f9fae9 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Fri, 10 Aug 2018 20:46:41 +0530 Subject: net/tls: Combined memory allocation for decryption request For preparing decryption request, several memory chunks are required (aead_req, sgin, sgout, iv, aad). For submitting the decrypt request to an accelerator, it is required that the buffers which are read by the accelerator must be dma-able and not come from stack. The buffers for aad and iv can be separately kmalloced each, but it is inefficient. This patch does a combined allocation for preparing decryption request and then segments into aead_req || sgin || sgout || iv || aad. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 4 - net/tls/tls_sw.c | 238 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 142 insertions(+), 100 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index d8b3b6578c01..d5c683e8bb22 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -124,10 +124,6 @@ struct tls_sw_context_rx { struct sk_buff *recv_pkt; u8 control; bool decrypted; - - char rx_aad_ciphertext[TLS_AAD_SPACE_SIZE]; - char rx_aad_plaintext[TLS_AAD_SPACE_SIZE]; - }; struct tls_record_info { diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 83d67df33f0c..52fbe727d7c1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -48,19 +48,13 @@ static int tls_do_decryption(struct sock *sk, struct scatterlist *sgout, char *iv_recv, size_t data_len, - struct sk_buff *skb, - gfp_t flags) + struct aead_request *aead_req) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - struct aead_request *aead_req; - int ret; - aead_req = aead_request_alloc(ctx->aead_recv, flags); - if (!aead_req) - return -ENOMEM; - + aead_request_set_tfm(aead_req, ctx->aead_recv); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, sgin, sgout, data_len + tls_ctx->rx.tag_size, @@ -69,8 +63,6 @@ static int tls_do_decryption(struct sock *sk, crypto_req_done, &ctx->async_wait); ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); - - aead_request_free(aead_req); return ret; } @@ -657,8 +649,132 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +/* This function decrypts the input skb into either out_iov or in out_sg + * or in skb buffers itself. The input parameter 'zc' indicates if + * zero-copy mode needs to be tried or not. With zero-copy mode, either + * out_iov or out_sg must be non-NULL. In case both out_iov and out_sg are + * NULL, then the decryption happens inside skb buffers itself, i.e. + * zero-copy gets disabled and 'zc' is updated. + */ + +static int decrypt_internal(struct sock *sk, struct sk_buff *skb, + struct iov_iter *out_iov, + struct scatterlist *out_sg, + int *chunk, bool *zc) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0; + struct aead_request *aead_req; + struct sk_buff *unused; + u8 *aad, *iv, *mem = NULL; + struct scatterlist *sgin = NULL; + struct scatterlist *sgout = NULL; + const int data_len = rxm->full_len - tls_ctx->rx.overhead_size; + + if (*zc && (out_iov || out_sg)) { + if (out_iov) + n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; + else + n_sgout = sg_nents(out_sg); + } else { + n_sgout = 0; + *zc = false; + } + + n_sgin = skb_cow_data(skb, 0, &unused); + if (n_sgin < 1) + return -EBADMSG; + + /* Increment to accommodate AAD */ + n_sgin = n_sgin + 1; + + nsg = n_sgin + n_sgout; + + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv); + mem_size = aead_size + (nsg * sizeof(struct scatterlist)); + mem_size = mem_size + TLS_AAD_SPACE_SIZE; + mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv); + + /* Allocate a single block of memory which contains + * aead_req || sgin[] || sgout[] || aad || iv. + * This order achieves correct alignment for aead_req, sgin, sgout. + */ + mem = kmalloc(mem_size, sk->sk_allocation); + if (!mem) + return -ENOMEM; + + /* Segment the allocated memory */ + aead_req = (struct aead_request *)mem; + sgin = (struct scatterlist *)(mem + aead_size); + sgout = sgin + n_sgin; + aad = (u8 *)(sgout + n_sgout); + iv = aad + TLS_AAD_SPACE_SIZE; + + /* Prepare IV */ + err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (err < 0) { + kfree(mem); + return err; + } + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + + /* Prepare AAD */ + tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, + ctx->control); + + /* Prepare sgin */ + sg_init_table(sgin, n_sgin); + sg_set_buf(&sgin[0], aad, TLS_AAD_SPACE_SIZE); + err = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + if (err < 0) { + kfree(mem); + return err; + } + + if (n_sgout) { + if (out_iov) { + sg_init_table(sgout, n_sgout); + sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); + + *chunk = 0; + err = zerocopy_from_iter(sk, out_iov, data_len, &pages, + chunk, &sgout[1], + (n_sgout - 1), false); + if (err < 0) + goto fallback_to_reg_recv; + } else if (out_sg) { + memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); + } else { + goto fallback_to_reg_recv; + } + } else { +fallback_to_reg_recv: + sgout = sgin; + pages = 0; + *chunk = 0; + *zc = false; + } + + /* Prepare and submit AEAD request */ + err = tls_do_decryption(sk, sgin, sgout, iv, data_len, aead_req); + + /* Release the pages in case iov was mapped to pages */ + for (; pages > 0; pages--) + put_page(sg_page(&sgout[pages])); + + kfree(mem); + return err; +} + static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, - struct scatterlist *sgout, bool *zc) + struct iov_iter *dest, int *chunk, bool *zc) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -671,7 +787,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, return err; #endif if (!ctx->decrypted) { - err = decrypt_skb(sk, skb, sgout); + err = decrypt_internal(sk, skb, dest, NULL, chunk, zc); if (err < 0) return err; } else { @@ -690,54 +806,10 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); - char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; - struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; - struct scatterlist *sgin = &sgin_arr[0]; - struct strp_msg *rxm = strp_msg(skb); - int ret, nsg = ARRAY_SIZE(sgin_arr); - struct sk_buff *unused; - - ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, - iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - tls_ctx->rx.iv_size); - if (ret < 0) - return ret; - - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - if (!sgout) { - nsg = skb_cow_data(skb, 0, &unused) + 1; - sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); - sgout = sgin; - } - - sg_init_table(sgin, nsg); - sg_set_buf(&sgin[0], ctx->rx_aad_ciphertext, TLS_AAD_SPACE_SIZE); - - nsg = skb_to_sgvec(skb, &sgin[1], - rxm->offset + tls_ctx->rx.prepend_size, - rxm->full_len - tls_ctx->rx.prepend_size); - if (nsg < 0) { - ret = nsg; - goto out; - } - - tls_make_aad(ctx->rx_aad_ciphertext, - rxm->full_len - tls_ctx->rx.overhead_size, - tls_ctx->rx.rec_seq, - tls_ctx->rx.rec_seq_size, - ctx->control); - - ret = tls_do_decryption(sk, sgin, sgout, iv, - rxm->full_len - tls_ctx->rx.overhead_size, - skb, sk->sk_allocation); - -out: - if (sgin != &sgin_arr[0]) - kfree(sgin); + bool zc = true; + int chunk; - return ret; + return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc); } static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, @@ -816,43 +888,17 @@ int tls_sw_recvmsg(struct sock *sk, } if (!ctx->decrypted) { - int page_count; - int to_copy; - - page_count = iov_iter_npages(&msg->msg_iter, - MAX_SKB_FRAGS); - to_copy = rxm->full_len - tls_ctx->rx.overhead_size; - if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS && - likely(!(flags & MSG_PEEK))) { - struct scatterlist sgin[MAX_SKB_FRAGS + 1]; - int pages = 0; + int to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (!is_kvec && to_copy <= len && + likely(!(flags & MSG_PEEK))) zc = true; - sg_init_table(sgin, MAX_SKB_FRAGS + 1); - sg_set_buf(&sgin[0], ctx->rx_aad_plaintext, - TLS_AAD_SPACE_SIZE); - - err = zerocopy_from_iter(sk, &msg->msg_iter, - to_copy, &pages, - &chunk, &sgin[1], - MAX_SKB_FRAGS, false); - if (err < 0) - goto fallback_to_reg_recv; - - err = decrypt_skb_update(sk, skb, sgin, &zc); - for (; pages > 0; pages--) - put_page(sg_page(&sgin[pages])); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } - } else { -fallback_to_reg_recv: - err = decrypt_skb_update(sk, skb, NULL, &zc); - if (err < 0) { - tls_err_abort(sk, EBADMSG); - goto recv_end; - } + + err = decrypt_skb_update(sk, skb, &msg->msg_iter, + &chunk, &zc); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; } ctx->decrypted = true; } @@ -903,7 +949,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, int err = 0; long timeo; int chunk; - bool zc; + bool zc = false; lock_sock(sk); @@ -920,7 +966,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (!ctx->decrypted) { - err = decrypt_skb_update(sk, skb, NULL, &zc); + err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc); if (err < 0) { tls_err_abort(sk, EBADMSG); -- cgit v1.2.3 From e6f86b0f7ae473969a3301b74bf98af9e42ecd0e Mon Sep 17 00:00:00 2001 From: Virgile Jarry Date: Fri, 10 Aug 2018 17:48:15 +0200 Subject: ipv6: Add icmp_echo_ignore_all support for ICMPv6 Preventing the kernel from responding to ICMP Echo Requests messages can be useful in several ways. The sysctl parameter 'icmp_echo_ignore_all' can be used to prevent the kernel from responding to IPv4 ICMP echo requests. For IPv6 pings, such a sysctl kernel parameter did not exist. Add the ability to prevent the kernel from responding to IPv6 ICMP echo requests through the use of the following sysctl parameter : /proc/sys/net/ipv6/icmp/echo_ignore_all. Update the documentation to reflect this change. Signed-off-by: Virgile Jarry Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/netns/ipv6.h | 1 + include/uapi/linux/sysctl.h | 3 ++- net/ipv6/af_inet6.c | 1 + net/ipv6/icmp.c | 16 +++++++++++++--- 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index e74515ecaa9c..8313a636dd53 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1882,6 +1882,11 @@ ratelimit - INTEGER otherwise the minimal space between responses in milliseconds. Default: 1000 +echo_ignore_all - BOOLEAN + If set non-zero, then the kernel will ignore all ICMP ECHO + requests sent to it over the IPv6 protocol. + Default: 0 + xfrm6_gc_thresh - INTEGER The threshold at which we will start garbage collecting for IPv6 destination cache entries. At twice this value the system will diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 762ac9931b62..f0e396ab9bec 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -32,6 +32,7 @@ struct netns_sysctl_ipv6 { int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; + int icmpv6_echo_ignore_all; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 6b58371b1f0d..d71013fffaf6 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -575,7 +575,8 @@ enum { /* /proc/sys/net/ipv6/icmp */ enum { - NET_IPV6_ICMP_RATELIMIT=1 + NET_IPV6_ICMP_RATELIMIT = 1, + NET_IPV6_ICMP_ECHO_IGNORE_ALL = 2 }; /* /proc/sys/net//neigh/ */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 020f6e14a7af..673bba31eb18 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -832,6 +832,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; + net->ipv6.sysctl.icmpv6_echo_ignore_all = 0; net->ipv6.sysctl.flowlabel_consistency = 1; net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7f6b1f81c200..c9c53ade55c3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -794,6 +794,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct net_device *dev = skb->dev; struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -843,7 +844,8 @@ static int icmpv6_rcv(struct sk_buff *skb) switch (type) { case ICMPV6_ECHO_REQUEST: - icmpv6_echo_reply(skb); + if (!net->ipv6.sysctl.icmpv6_echo_ignore_all) + icmpv6_echo_reply(skb); break; case ICMPV6_ECHO_REPLY: @@ -1104,6 +1106,13 @@ static struct ctl_table ipv6_icmp_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "echo_ignore_all", + .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { }, }; @@ -1115,9 +1124,10 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) sizeof(ipv6_icmp_table_template), GFP_KERNEL); - if (table) + if (table) { table[0].data = &net->ipv6.sysctl.icmpv6_time; - + table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all; + } return table; } #endif -- cgit v1.2.3 From 96d18d8254dc5a3f0067a629866af4165b3afe32 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 1 Aug 2018 14:57:59 -0700 Subject: inet/connection_sock: prefer _THIS_IP_ to current_text_addr As part of the effort to reduce the code duplication between _THIS_IP_ and current_text_addr(), let's consolidate callers of current_text_addr() to use _THIS_IP_. Signed-off-by: Nick Desaulniers Acked-by: David S. Miller Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fa43b82607d9..371b3b45fd5c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -225,7 +226,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if (when > max_when) { pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", - sk, what, when, current_text_addr()); + sk, what, when, (void *)_THIS_IP_); when = max_when; } -- cgit v1.2.3