From 4aca87515a5083ae0e31ce3177189fd43b6d05ac Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Sat, 3 Jan 2015 13:58:15 +0100 Subject: patch to Vanilla Tomato 1.28 --- release/src/linux/linux/net/core/skbuff.c | 16 + release/src/linux/linux/net/ipv4/arp.c | 24 +- release/src/linux/linux/net/ipv4/igmp.c | 5 +- .../src/linux/linux/net/ipv4/netfilter/Config.in | 63 +- .../src/linux/linux/net/ipv4/netfilter/Makefile | 41 +- .../linux/linux/net/ipv4/netfilter/arp_tables.c | 13 +- .../linux/net/ipv4/netfilter/ip_conntrack_core.c | 185 +- .../linux/net/ipv4/netfilter/ip_conntrack_h323.c | 34 +- .../linux/net/ipv4/netfilter/ip_conntrack_pptp.c | 150 +- .../net/ipv4/netfilter/ip_conntrack_proto_esp.c | 0 .../net/ipv4/netfilter/ip_conntrack_proto_gre.c | 55 +- .../net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 8 +- .../net/ipv4/netfilter/ip_conntrack_proto_udp.c | 30 +- .../net/ipv4/netfilter/ip_conntrack_standalone.c | 550 +----- .../linux/linux/net/ipv4/netfilter/ip_nat_core.c | 7 +- .../linux/linux/net/ipv4/netfilter/ip_nat_h323.c | 12 +- .../linux/linux/net/ipv4/netfilter/ip_nat_helper.c | 38 +- .../linux/linux/net/ipv4/netfilter/ip_nat_pptp.c | 265 +-- .../linux/net/ipv4/netfilter/ip_nat_proto_esp.c | 0 .../linux/net/ipv4/netfilter/ip_nat_proto_gre.c | 21 +- .../linux/net/ipv4/netfilter/ip_nat_proto_udp.c | 3 - .../src/linux/linux/net/ipv4/netfilter/ip_tables.c | 33 +- .../linux/linux/net/ipv4/netfilter/ipt_BCOUNT.c | 63 + .../linux/linux/net/ipv4/netfilter/ipt_CLASSIFY.c | 82 + .../linux/linux/net/ipv4/netfilter/ipt_CONNMARK.c | 128 ++ .../src/linux/linux/net/ipv4/netfilter/ipt_IMQ.c | 78 + .../linux/linux/net/ipv4/netfilter/ipt_MACSAVE.c | 65 + .../src/linux/linux/net/ipv4/netfilter/ipt_ROUTE.c | 422 +++++ .../linux/linux/net/ipv4/netfilter/ipt_TRIGGER.c | 14 +- .../src/linux/linux/net/ipv4/netfilter/ipt_TTL.c | 110 ++ .../linux/linux/net/ipv4/netfilter/ipt_account.c | 942 ++++++++++ .../linux/linux/net/ipv4/netfilter/ipt_bcount.c | 59 + 
.../linux/linux/net/ipv4/netfilter/ipt_condition.c | 256 +++ .../linux/linux/net/ipv4/netfilter/ipt_connlimit.c | 222 +++ .../linux/linux/net/ipv4/netfilter/ipt_connmark.c | 83 + .../src/linux/linux/net/ipv4/netfilter/ipt_exp.c | 57 + .../src/linux/linux/net/ipv4/netfilter/ipt_geoip.c | 272 +++ .../src/linux/linux/net/ipv4/netfilter/ipt_ipp2p.c | 868 ++++++++++ .../linux/linux/net/ipv4/netfilter/ipt_iprange.c | 101 ++ .../linux/linux/net/ipv4/netfilter/ipt_layer7.c | 570 ++++++ .../src/linux/linux/net/ipv4/netfilter/ipt_mac.c | 6 +- .../linux/linux/net/ipv4/netfilter/ipt_macsave.c | 62 + .../src/linux/linux/net/ipv4/netfilter/ipt_mport.c | 4 + .../src/linux/linux/net/ipv4/netfilter/ipt_quota.c | 88 + .../linux/linux/net/ipv4/netfilter/ipt_recent.c | 998 +++++++++++ .../linux/linux/net/ipv4/netfilter/ipt_string.c | 218 +++ .../src/linux/linux/net/ipv4/netfilter/ipt_time.c | 46 +- .../src/linux/linux/net/ipv4/netfilter/ipt_u32.c | 211 +++ .../src/linux/linux/net/ipv4/netfilter/ipt_web.c | 246 +++ .../linux/linux/net/ipv4/netfilter/regexp/regexp.c | 1195 +++++++++++++ .../linux/linux/net/ipv4/netfilter/regexp/regexp.h | 40 + .../linux/net/ipv4/netfilter/regexp/regmagic.h | 5 + .../linux/linux/net/ipv4/netfilter/regexp/regsub.c | 95 + .../src/linux/linux/net/ipv4/netfilter/tomato_ct.c | 181 ++ release/src/linux/linux/net/ipv4/route.c | 10 + release/src/linux/linux/net/ipv4/sysctl_net_ipv4.c | 12 + release/src/linux/linux/net/ipv4/tcp_input.c | 311 +++- release/src/linux/linux/net/ipv4/tcp_minisocks.c | 3 +- release/src/linux/linux/net/ipv4/tcp_output.c | 20 +- .../src/linux/linux/net/ipv6/netfilter/Config.in | 4 + .../src/linux/linux/net/ipv6/netfilter/Makefile | 3 + .../linux/linux/net/ipv6/netfilter/ip6_tables.c | 6 - .../src/linux/linux/net/ipv6/netfilter/ip6t_IMQ.c | 78 + .../linux/linux/net/ipv6/netfilter/ip6t_ROUTE.c | 308 ++++ .../linux/net/ipv6/netfilter/ip6t_condition.c | 254 +++ release/src/linux/linux/net/sched/Config.in | 3 +- 
release/src/linux/linux/net/sched/Makefile | 1 + release/src/linux/linux/net/sched/sch_api.c | 3 + release/src/linux/linux/net/sched/sch_esfq.c | 652 +++++++ release/src/linux/linux/net/sched/sch_fifo.c | 15 +- release/src/linux/linux/net/sched/sch_generic.c | 13 +- release/src/linux/linux/net/sched/sch_hfsc.c | 1817 ++++++++++++++++++++ release/src/linux/linux/net/sched/sch_htb.c | 255 ++- release/src/linux/linux/net/sched/sch_ingress.c | 4 - release/src/linux/linux/net/sched/sch_sfq.c | 8 +- release/src/linux/linux/net/socket.c | 3 + 76 files changed, 12035 insertions(+), 1088 deletions(-) mode change 100755 => 100644 release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_esp.c mode change 100755 => 100644 release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_esp.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_BCOUNT.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_CLASSIFY.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_CONNMARK.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_IMQ.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_MACSAVE.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_ROUTE.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_TTL.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_account.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_bcount.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_condition.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_connlimit.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_connmark.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_exp.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_geoip.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_ipp2p.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_iprange.c create 
mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_layer7.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_macsave.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_quota.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_recent.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_string.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_u32.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/ipt_web.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.h create mode 100644 release/src/linux/linux/net/ipv4/netfilter/regexp/regmagic.h create mode 100644 release/src/linux/linux/net/ipv4/netfilter/regexp/regsub.c create mode 100644 release/src/linux/linux/net/ipv4/netfilter/tomato_ct.c create mode 100644 release/src/linux/linux/net/ipv6/netfilter/ip6t_IMQ.c create mode 100644 release/src/linux/linux/net/ipv6/netfilter/ip6t_ROUTE.c create mode 100644 release/src/linux/linux/net/ipv6/netfilter/ip6t_condition.c create mode 100644 release/src/linux/linux/net/sched/sch_esfq.c create mode 100644 release/src/linux/linux/net/sched/sch_hfsc.c (limited to 'release/src/linux/linux/net') diff --git a/release/src/linux/linux/net/core/skbuff.c b/release/src/linux/linux/net/core/skbuff.c index 57d19374..32476545 100644 --- a/release/src/linux/linux/net/core/skbuff.c +++ b/release/src/linux/linux/net/core/skbuff.c @@ -201,6 +201,10 @@ struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) /* Set up other state */ skb->len = 0; skb->cloned = 0; +#if defined(CONFIG_IMQ) || defined (CONFIG_IMQ_MODULE) + skb->imq_flags = 0; + skb->nf_info = NULL; +#endif skb->data_len = 0; atomic_set(&skb->users, 1); @@ -248,6 +252,10 @@ static inline void skb_headerinit(void *p, kmem_cache_t *cache, #ifdef CONFIG_NET_SCHED skb->tc_index = 0; #endif +#if defined(CONFIG_IMQ) || 
defined(CONFIG_IMQ_MODULE) + skb->imq_flags = 0; + skb->nf_info = NULL; +#endif } static void skb_drop_fraglist(struct sk_buff *skb) @@ -397,6 +405,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) #ifdef CONFIG_NET_SCHED C(tc_index); #endif +#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) + C(imq_flags); + C(nf_info); +#endif atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -440,6 +452,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_SCHED new->tc_index = old->tc_index; #endif +#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) + new->imq_flags=old->imq_flags; + new->nf_info=old->nf_info; +#endif } /** diff --git a/release/src/linux/linux/net/ipv4/arp.c b/release/src/linux/linux/net/ipv4/arp.c index aecd020a..e458f7d5 100644 --- a/release/src/linux/linux/net/ipv4/arp.c +++ b/release/src/linux/linux/net/ipv4/arp.c @@ -171,8 +171,8 @@ struct neigh_table arp_tbl = { id: "arp_cache", parms: { tbl: &arp_tbl, - /*zhijian 2006-10-23 modify to solve arp entry timeout problem(cdrouter3.3 scaling module)*/ - #if 0 +/*zhijian 2006-10-23 modify to solve arp entry timeout problem(cdrouter3.3 scaling module)*/ +#if 0 base_reachable_time: 30 * HZ, retrans_time: 1 * HZ, gc_staletime: 60 * HZ, @@ -181,16 +181,16 @@ struct neigh_table arp_tbl = { queue_len: 3, ucast_probes: 3, mcast_probes: 3, - #else - base_reachable_time: 60 * HZ, - retrans_time: 5 * HZ, - gc_staletime: 120 * HZ, - reachable_time: 60 * HZ, - delay_probe_time: 10 * HZ, - queue_len: 3, - ucast_probes: 6, - mcast_probes: 6, - #endif +#else + base_reachable_time: 60 * HZ, + retrans_time: 5 * HZ, + gc_staletime: 120 * HZ, + reachable_time: 60 * HZ, + delay_probe_time: 10 * HZ, + queue_len: 3, + ucast_probes: 6, + mcast_probes: 6, +#endif anycast_delay: 1 * HZ, proxy_delay: (8 * HZ) / 10, proxy_qlen: 64, diff --git a/release/src/linux/linux/net/ipv4/igmp.c b/release/src/linux/linux/net/ipv4/igmp.c index 3f718f2a..c53d8feb 100644 
--- a/release/src/linux/linux/net/ipv4/igmp.c +++ b/release/src/linux/linux/net/ipv4/igmp.c @@ -677,8 +677,9 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) count++; } err = -ENOBUFS; - //if (iml == NULL || count >= sysctl_igmp_max_memberships) - if (iml == NULL || count > sysctl_igmp_max_memberships)// modify for cdrouter v3.3 item 300(cdrouter_mcast_100) bug + // if (iml == NULL || count >= sysctl_igmp_max_memberships) + // 43011: modify for cdrouter v3.3 item 300(cdrouter_mcast_100) bug + if (iml == NULL || count > sysctl_igmp_max_memberships) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next = sk->protinfo.af_inet.mc_list; diff --git a/release/src/linux/linux/net/ipv4/netfilter/Config.in b/release/src/linux/linux/net/ipv4/netfilter/Config.in index 7662305e..b1f7f985 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/Config.in +++ b/release/src/linux/linux/net/ipv4/netfilter/Config.in @@ -7,16 +7,17 @@ comment ' IP: Netfilter Configuration' tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP_NF_CONNTRACK if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then dep_tristate ' FTP protocol support' CONFIG_IP_NF_FTP $CONFIG_IP_NF_CONNTRACK - dep_tristate ' TFTP protocol support' CONFIG_IP_NF_TFTP $CONFIG_IP_NF_CONNTRACK + bool ' Connection mark tracking support' CONFIG_IP_NF_CONNTRACK_MARK dep_tristate ' H.323 (netmeeting) support' CONFIG_IP_NF_H323 $CONFIG_IP_NF_CONNTRACK + dep_tristate ' TFTP protocol support' CONFIG_IP_NF_TFTP $CONFIG_IP_NF_CONNTRACK dep_tristate ' IRC protocol support' CONFIG_IP_NF_IRC $CONFIG_IP_NF_CONNTRACK dep_tristate ' CuSeeMe protocol support' CONFIG_IP_NF_CUSEEME $CONFIG_IP_NF_CONNTRACK dep_tristate ' Quake III protocol support' CONFIG_IP_NF_QUAKE3 $CONFIG_IP_NF_CONNTRACK dep_tristate ' RTSP protocol support' CONFIG_IP_NF_RTSP $CONFIG_IP_NF_CONNTRACK dep_tristate ' MMS protocol support' CONFIG_IP_NF_MMS $CONFIG_IP_NF_CONNTRACK - dep_tristate ' SIP protocol support' CONFIG_IP_NF_SIP 
$CONFIG_IP_NF_CONNTRACK dep_tristate ' GRE protocol support' CONFIG_IP_NF_CT_PROTO_GRE $CONFIG_IP_NF_CONNTRACK dep_tristate ' PPTP protocol support' CONFIG_IP_NF_PPTP $CONFIG_IP_NF_CT_PROTO_GRE + dep_tristate ' SIP protocol support' CONFIG_IP_NF_SIP $CONFIG_IP_NF_CONNTRACK dep_tristate ' ESP protocol support' CONFIG_IP_NF_CT_PROTO_ESP $CONFIG_IP_NF_CONNTRACK fi @@ -27,6 +28,10 @@ tristate 'IP tables support (required for filtering/masq/NAT)' CONFIG_IP_NF_IPTA if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then # The simple matches. dep_tristate ' limit match support' CONFIG_IP_NF_MATCH_LIMIT $CONFIG_IP_NF_IPTABLES + dep_tristate ' IPP2P match support' CONFIG_IP_NF_MATCH_IPP2P $CONFIG_IP_NF_IPTABLES + dep_tristate ' geoip match support' CONFIG_IP_NF_MATCH_GEOIP $CONFIG_IP_NF_IPTABLES + dep_tristate ' quota match support' CONFIG_IP_NF_MATCH_QUOTA $CONFIG_IP_NF_IPTABLES + dep_tristate ' IP range match support' CONFIG_IP_NF_MATCH_IPRANGE $CONFIG_IP_NF_IPTABLES dep_tristate ' IP address pool support' CONFIG_IP_NF_POOL $CONFIG_IP_NF_IPTABLES if [ "$CONFIG_IP_NF_POOL" = "y" -o "$CONFIG_IP_NF_POOL" = "m" ]; then @@ -39,6 +44,12 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then dep_tristate ' Multiple port match support' CONFIG_IP_NF_MATCH_MULTIPORT $CONFIG_IP_NF_IPTABLES dep_tristate ' Multiple port with ranges match support' CONFIG_IP_NF_MATCH_MPORT $CONFIG_IP_NF_IPTABLES dep_tristate ' TOS match support' CONFIG_IP_NF_MATCH_TOS $CONFIG_IP_NF_IPTABLES + dep_tristate ' recent match support' CONFIG_IP_NF_MATCH_RECENT $CONFIG_IP_NF_IPTABLES + dep_tristate ' account match support' CONFIG_IP_NF_MATCH_ACCOUNT $CONFIG_IP_NF_IPTABLES $CONFIG_PROC_FS + if [ "$CONFIG_IP_NF_MATCH_ACCOUNT" != "n" ]; then + bool ' account debugging output' CONFIG_IP_NF_MATCH_ACCOUNT_DEBUG + fi + dep_tristate ' condition match support' CONFIG_IP_NF_MATCH_CONDITION $CONFIG_IP_NF_IPTABLES dep_tristate ' TIME match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_TIME $CONFIG_IP_NF_IPTABLES dep_tristate ' ECN match 
support' CONFIG_IP_NF_MATCH_ECN $CONFIG_IP_NF_IPTABLES @@ -46,19 +57,36 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then dep_tristate ' AH/ESP match support' CONFIG_IP_NF_MATCH_AH_ESP $CONFIG_IP_NF_IPTABLES dep_tristate ' LENGTH match support' CONFIG_IP_NF_MATCH_LENGTH $CONFIG_IP_NF_IPTABLES + dep_tristate ' U32 match support' CONFIG_IP_NF_MATCH_U32 $CONFIG_IP_NF_U32 dep_tristate ' TTL match support' CONFIG_IP_NF_MATCH_TTL $CONFIG_IP_NF_IPTABLES dep_tristate ' tcpmss match support' CONFIG_IP_NF_MATCH_TCPMSS $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then dep_tristate ' Helper match support' CONFIG_IP_NF_MATCH_HELPER $CONFIG_IP_NF_IPTABLES fi if [ "$CONFIG_IP_NF_CONNTRACK" != "n" ]; then dep_tristate ' Connection state match support' CONFIG_IP_NF_MATCH_STATE $CONFIG_IP_NF_CONNTRACK $CONFIG_IP_NF_IPTABLES + dep_tristate ' Connections/IP limit match support' CONFIG_IP_NF_MATCH_CONNLIMIT $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_CONNTRACK_MARK" != "n" ]; then + dep_tristate ' Connection mark match support' CONFIG_IP_NF_MATCH_CONNMARK $CONFIG_IP_NF_IPTABLES + fi dep_tristate ' Connection tracking match support' CONFIG_IP_NF_MATCH_CONNTRACK $CONFIG_IP_NF_CONNTRACK $CONFIG_IP_NF_IPTABLES fi if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES + dep_tristate ' String match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_STRING $CONFIG_IP_NF_IPTABLES dep_tristate ' Webstr match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_WEBSTR $CONFIG_IP_NF_IPTABLES dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES + dep_tristate ' Layer 7 match support (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_LAYER7 $CONFIG_IP_NF_CONNTRACK + dep_mbool ' Layer 7 debugging output (EXPERIMENTAL)' CONFIG_IP_NF_MATCH_LAYER7_DEBUG $CONFIG_IP_NF_MATCH_LAYER7 + + dep_tristate ' web match' CONFIG_IP_NF_MATCH_WEB $CONFIG_IP_NF_IPTABLES + dep_tristate ' BCOUNT 
target' CONFIG_IP_NF_TARGET_BCOUNT $CONFIG_IP_NF_IPTABLES + dep_tristate ' bcount match' CONFIG_IP_NF_MATCH_BCOUNT $CONFIG_IP_NF_TARGET_BCOUNT + dep_tristate ' MACSAVE target' CONFIG_IP_NF_TARGET_MACSAVE $CONFIG_IP_NF_IPTABLES + dep_tristate ' macsave match' CONFIG_IP_NF_MATCH_MACSAVE $CONFIG_IP_NF_TARGET_MACSAVE + dep_tristate ' exp match (experimental rig - do not use)' CONFIG_IP_NF_MATCH_EXP $CONFIG_IP_NF_IPTABLES + fi # The targets dep_tristate ' Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES @@ -75,8 +103,6 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then define_bool CONFIG_IP_NF_NAT_NEEDED y dep_tristate ' MASQUERADE target support' CONFIG_IP_NF_TARGET_MASQUERADE $CONFIG_IP_NF_NAT dep_tristate ' REDIRECT target support' CONFIG_IP_NF_TARGET_REDIRECT $CONFIG_IP_NF_NAT - dep_tristate ' Automatic port forwarding (autofw) target support' CONFIG_IP_NF_AUTOFW $CONFIG_IP_NF_NAT - dep_tristate ' TRIGGER target support (port-trigger)' CONFIG_IP_NF_TARGET_TRIGGER $CONFIG_IP_NF_NAT if [ "$CONFIG_IP_NF_H323" = "m" ]; then define_tristate CONFIG_IP_NF_NAT_H323 m else @@ -84,6 +110,8 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then define_tristate CONFIG_IP_NF_NAT_H323 $CONFIG_IP_NF_NAT fi fi + dep_tristate ' Automatic port forwarding (autofw) target support' CONFIG_IP_NF_AUTOFW $CONFIG_IP_NF_NAT + dep_tristate ' TRIGGER target support (port-trigger)' CONFIG_IP_NF_TARGET_TRIGGER $CONFIG_IP_NF_NAT if [ "$CONFIG_IP_NF_PPTP" = "m" ]; then define_tristate CONFIG_IP_NF_NAT_PPTP m else @@ -109,13 +137,20 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then dep_tristate ' Basic SNMP-ALG support (EXPERIMENTAL)' CONFIG_IP_NF_NAT_SNMP_BASIC $CONFIG_IP_NF_NAT fi + if [ "$CONFIG_IP_NF_RTSP" = "m" ]; then + define_tristate CONFIG_IP_NF_NAT_RTSP m + else + if [ "$CONFIG_IP_NF_RTSP" = "y" ]; then + define_tristate CONFIG_IP_NF_NAT_RTSP $CONFIG_IP_NF_NAT + fi + fi if [ "$CONFIG_IP_NF_IRC" = "m" ]; then define_tristate CONFIG_IP_NF_NAT_IRC m 
else if [ "$CONFIG_IP_NF_IRC" = "y" ]; then define_tristate CONFIG_IP_NF_NAT_IRC $CONFIG_IP_NF_NAT fi - fi + fi if [ "$CONFIG_IP_NF_CUSEEME" = "m" ]; then define_tristate CONFIG_IP_NF_NAT_CUSEEME m else @@ -152,13 +187,6 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then if [ "$CONFIG_IP_NF_TFTP" = "y" ]; then define_tristate CONFIG_IP_NF_NAT_TFTP $CONFIG_IP_NF_NAT fi - fi - if [ "$CONFIG_IP_NF_RTSP" = "m" ]; then - define_tristate CONFIG_IP_NF_NAT_RTSP m - else - if [ "$CONFIG_IP_NF_RTSP" = "y" ]; then - define_tristate CONFIG_IP_NF_NAT_RTSP $CONFIG_IP_NF_NAT - fi fi if [ "$CONFIG_IP_NF_CT_PROTO_ESP" = "m" ]; then define_tristate CONFIG_IP_NF_NAT_PROTO_ESP m @@ -178,8 +206,15 @@ if [ "$CONFIG_IP_NF_IPTABLES" != "n" ]; then dep_tristate ' DSCP target support' CONFIG_IP_NF_TARGET_DSCP $CONFIG_IP_NF_MANGLE dep_tristate ' MARK target support' CONFIG_IP_NF_TARGET_MARK $CONFIG_IP_NF_MANGLE + dep_tristate ' ROUTE target support' CONFIG_IP_NF_TARGET_ROUTE $CONFIG_IP_NF_MANGLE + dep_tristate ' CLASSIFY target support (EXPERIMENTAL)' CONFIG_IP_NF_TARGET_CLASSIFY $CONFIG_IP_NF_FILTER + dep_tristate ' IMQ target support' CONFIG_IP_NF_TARGET_IMQ $CONFIG_IP_NF_MANGLE fi dep_tristate ' LOG target support' CONFIG_IP_NF_TARGET_LOG $CONFIG_IP_NF_IPTABLES + if [ "$CONFIG_IP_NF_CONNTRACK_MARK" != "n" ]; then + dep_tristate ' CONNMARK target support' CONFIG_IP_NF_TARGET_CONNMARK $CONFIG_IP_NF_IPTABLES + fi + dep_tristate ' TTL target support' CONFIG_IP_NF_TARGET_TTL $CONFIG_IP_NF_IPTABLES dep_tristate ' ULOG target support' CONFIG_IP_NF_TARGET_ULOG $CONFIG_IP_NF_IPTABLES dep_tristate ' TCPMSS target support' CONFIG_IP_NF_TARGET_TCPMSS $CONFIG_IP_NF_IPTABLES fi @@ -189,6 +224,10 @@ if [ "$CONFIG_IP_NF_ARPTABLES" != "n" ]; then dep_tristate ' ARP packet filtering' CONFIG_IP_NF_ARPFILTER $CONFIG_IP_NF_ARPTABLES fi +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + tristate 'tomato_ct' CONFIG_IP_NF_TOMATOCT +fi + # Backwards compatibility modules: only if you don't build in the others. 
if [ "$CONFIG_IP_NF_CONNTRACK" != "y" ]; then if [ "$CONFIG_IP_NF_IPTABLES" != "y" ]; then diff --git a/release/src/linux/linux/net/ipv4/netfilter/Makefile b/release/src/linux/linux/net/ipv4/netfilter/Makefile index abf55469..80de56f3 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/Makefile +++ b/release/src/linux/linux/net/ipv4/netfilter/Makefile @@ -33,10 +33,10 @@ obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o # H.323 support obj-$(CONFIG_IP_NF_H323) += ip_conntrack_h323.o -obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o -ifdef CONFIG_IP_NF_NAT_H323 +ifdef CONFIG_IP_NF_H323 export-objs += ip_conntrack_h323.o endif +obj-$(CONFIG_IP_NF_NAT_H323) += ip_nat_h323.o # connection tracking protocol helpers @@ -83,10 +83,14 @@ obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o ifdef CONFIG_IP_NF_NAT_IRC export-objs += ip_conntrack_irc.o endif + +# rtsp protocol support obj-$(CONFIG_IP_NF_RTSP) += ip_conntrack_rtsp.o ifdef CONFIG_IP_NF_NAT_RTSP export-objs += ip_conntrack_rtsp.o endif +obj-$(CONFIG_IP_NF_NAT_RTSP) += ip_nat_rtsp.o + # NAT helpers obj-$(CONFIG_IP_NF_NAT_CUSEEME) += ip_nat_cuseeme.o obj-$(CONFIG_IP_NF_NAT_PPTP) += ip_nat_pptp.o @@ -95,7 +99,6 @@ obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o obj-$(CONFIG_IP_NF_NAT_QUAKE3) += ip_nat_quake3.o -obj-$(CONFIG_IP_NF_NAT_RTSP) += ip_nat_rtsp.o obj-$(CONFIG_IP_NF_NAT_MMS) += ip_nat_mms.o # generic IP tables @@ -109,48 +112,64 @@ obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o # matches obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o +obj-$(CONFIG_IP_NF_MATCH_IPP2P) += ipt_ipp2p.o +obj-$(CONFIG_IP_NF_MATCH_GEOIP) += ipt_geoip.o +obj-$(CONFIG_IP_NF_MATCH_QUOTA) += ipt_quota.o +obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o obj-$(CONFIG_IP_NF_POOL) += ipt_pool.o ip_pool.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o - 
obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o - obj-$(CONFIG_IP_NF_MATCH_MPORT) += ipt_mport.o - obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o - +obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o +obj-$(CONFIG_IP_NF_MATCH_ACCOUNT) += ipt_account.o +obj-$(CONFIG_IP_NF_MATCH_CONDITION) += ipt_condition.o obj-$(CONFIG_IP_NF_MATCH_TIME) += ipt_time.o - obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o - obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o - +obj-$(CONFIG_IP_NF_MATCH_U32) += ipt_u32.o obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o +obj-$(CONFIG_IP_NF_MATCH_CONNLIMIT) += ipt_connlimit.o +obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o obj-$(CONFIG_IP_NF_MATCH_UNCLEAN) += ipt_unclean.o +obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o obj-$(CONFIG_IP_NF_MATCH_WEBSTR) += ipt_webstr.o obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o +obj-$(CONFIG_IP_NF_MATCH_LAYER7) += ipt_layer7.o +obj-$(CONFIG_IP_NF_MATCH_WEB) += ipt_web.o +obj-$(CONFIG_IP_NF_MATCH_MACSAVE) += ipt_macsave.o +obj-$(CONFIG_IP_NF_MATCH_EXP) += ipt_exp.o +obj-$(CONFIG_IP_NF_MATCH_BCOUNT) += ipt_bcount.o # targets obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o obj-$(CONFIG_IP_NF_TARGET_MIRROR) += ipt_MIRROR.o +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o +obj-$(CONFIG_IP_NF_TARGET_IMQ) += ipt_IMQ.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_ROUTE) += ipt_ROUTE.o obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o 
obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o +obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o +obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o obj-$(CONFIG_IP_NF_AUTOFW) += ip_autofw.o obj-$(CONFIG_IP_NF_TARGET_TRIGGER) += ipt_TRIGGER.o +obj-$(CONFIG_IP_NF_TARGET_MACSAVE) += ipt_MACSAVE.o +obj-$(CONFIG_IP_NF_TARGET_BCOUNT) += ipt_BCOUNT.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o @@ -164,6 +183,8 @@ obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o +obj-$(CONFIG_IP_NF_TOMATOCT) += tomato_ct.o + include $(TOPDIR)/Rules.make ip_conntrack.o: $(ip_conntrack-objs) diff --git a/release/src/linux/linux/net/ipv4/netfilter/arp_tables.c b/release/src/linux/linux/net/ipv4/netfilter/arp_tables.c index aa1c034a..757fc2ab 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/arp_tables.c +++ b/release/src/linux/linux/net/ipv4/netfilter/arp_tables.c @@ -986,13 +986,12 @@ static int do_add_counters(void *user, unsigned int len) goto free; write_lock_bh(&t->lock); - /************************************* - * modify by tanghui @ 2006-10-11 - * for a RACE CONDITION in the "do_add_counters()" function - *************************************/ - //if (t->private->number != paddc->num_counters) { - if (t->private->number != tmp.num_counters) { - /*************************************/ + +#if 0 // removed 1.11 forward bug test + // 43011 (09?): checkme: modify by tanghui @ 2006-10-11 for a RACE CONDITION in the "do_add_counters()" function + // if (t->private->number != tmp.num_counters) { +#endif + if (t->private->number != paddc->num_counters) { ret = -EINVAL; goto unlock_up_free; } diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_core.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_core.c index 9c6f040f..324951ee 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_core.c +++ 
b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_core.c @@ -34,7 +34,12 @@ /* For ERR_PTR(). Yeah, I know... --RR */ #include -#include +#define TEST_JHASH // test jhash from 2.4.33 -- zzz + +#ifdef TEST_JHASH +#include +#include +#endif /* This rwlock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -71,7 +76,7 @@ static kmem_cache_t *ip_conntrack_cachep; int sysctl_ip_conntrack_tcp_timeouts[10] = { 30 MINS, /* TCP_CONNTRACK_NONE, */ - 5 DAYS, /* TCP_CONNTRACK_ESTABLISHED, */ + 4 HOURS, /* TCP_CONNTRACK_ESTABLISHED, */ // was 5 days zzz 2 MINS, /* TCP_CONNTRACK_SYN_SENT, */ 60 SECS, /* TCP_CONNTRACK_SYN_RECV, */ 2 MINS, /* TCP_CONNTRACK_FIN_WAIT, */ @@ -128,9 +133,20 @@ ip_conntrack_put(struct ip_conntrack *ct) nf_conntrack_put(&ct->infos[0]); } +#ifdef TEST_JHASH +static int ip_conntrack_hash_rnd_initted; +static unsigned int ip_conntrack_hash_rnd; +#endif + static inline u_int32_t hash_conntrack(const struct ip_conntrack_tuple *tuple) { +#ifdef TEST_JHASH + return (jhash_3words(tuple->src.ip, + (tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + ip_conntrack_hash_rnd) % ip_conntrack_htable_size); +#else /* ntohl because more differences in low bits. */ /* To ensure that halves of the same connection don't hash clash, we add the source per-proto again. */ @@ -139,6 +155,7 @@ hash_conntrack(const struct ip_conntrack_tuple *tuple) + tuple->dst.protonum) + ntohs(tuple->src.u.all)) % ip_conntrack_htable_size; +#endif } inline int @@ -314,9 +331,6 @@ clean_from_lists(struct ip_conntrack *ct) { DEBUGP("clean_from_lists(%p)\n", ct); MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); - /* Remove from both hash lists: must not NULL out next ptrs, - otherwise we'll look unconfirmed. Fortunately, LIST_DELETE - doesn't do this. 
--RR */ LIST_DELETE(&ip_conntrack_hash [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); @@ -359,6 +373,14 @@ destroy_conntrack(struct nf_conntrack *nfct) list_del(&ct->master->expected_list); kfree(ct->master); } + + #if defined(CONFIG_IP_NF_MATCH_LAYER7) || defined(CONFIG_IP_NF_MATCH_LAYER7_MODULE) + if(ct->layer7.app_proto) + kfree(ct->layer7.app_proto); + if(ct->layer7.app_data) + kfree(ct->layer7.app_data); + #endif + WRITE_UNLOCK(&ip_conntrack_lock); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); @@ -489,6 +511,7 @@ __ip_conntrack_confirm(struct nf_ct_info *nfct) ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); WRITE_UNLOCK(&ip_conntrack_lock); return NF_ACCEPT; } @@ -606,7 +629,7 @@ icmp_error_track(struct sk_buff *skb, connection. Too bad: we're in trouble anyway. */ static inline int unreplied(const struct ip_conntrack_tuple_hash *i) { - return !(i->ctrack->status & IPS_ASSURED); + return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status)); } static int early_drop(struct list_head *chain) @@ -632,31 +655,6 @@ static int early_drop(struct list_head *chain) return dropped; } -/******************lzh add *************************************** -* DESCRIPTION:delete seleted ip conntrack from conntrack_hash list -* INPUT : ip_conntrack_tuple_hash h -* OUTPUT: NULL -* AUTHOR: linzhihong -* DATE : 2006.7.20 -*****************************************************************/ -void del_selected_conntrack(struct ip_conntrack_tuple_hash *h) -{ - DEBUGP("hahaha enter %s\n", __FUNCTION__); - if(h) - { - #if 1 - ip_ct_refresh(h->ctrack, 1*HZ); - #else - if(del_timer(&h->ctrack->timeout)) - { - death_by_timeout((unsigned long)h->ctrack); - } - //ip_conntrack_put(h->ctrack); - #endif - } -} -/**************************** lzh end ******************************/ - static inline int helper_cmp(const struct ip_conntrack_helper *i, const 
struct ip_conntrack_tuple *rtuple) { @@ -670,41 +668,6 @@ struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *t tuple); } -#define RESERVE_CONNTRACK_FOR_ROUTER -#ifdef RESERVE_CONNTRACK_FOR_ROUTER -#define RESERVE_CONNTRACK_NUM 20 -/* - Check if the packet is for Router AP(LAN side only), or generate from - Router itself(Both sides). - */ -static int cmp_local_ip(u_int32_t dst, u_int32_t src) -{ -#define IF_LAN_NAME "br0" - - int ret = -1; - struct in_device *in_dev; - struct net_device *dev; - struct in_ifaddr **ifap = NULL; - struct in_ifaddr *ifa = NULL; - - for(dev = dev_base; dev != NULL; dev = dev->next){ - if((in_dev=__in_dev_get(dev)) != NULL){ - for(ifap=&in_dev->ifa_list; (ifa=*ifap) != NULL; ifap=&ifa->ifa_next){ - if((ifa->ifa_address == dst && !strcmp(IF_LAN_NAME, ifa->ifa_label)) || ifa->ifa_address == src){ - /*match*/ - ret = 0; - break; - } - } - } - } - - return ret; - -#undef IF_LAN_NAME -} -#endif - /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. 
*/ static struct ip_conntrack_tuple_hash * @@ -719,9 +682,15 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, int i; static unsigned int drop_next = 0; +#ifdef TEST_JHASH + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } +#endif + hash = hash_conntrack(tuple); - #ifndef RESERVE_CONNTRACK_FOR_ROUTER if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { /* Try dropping from random chain, or else from the @@ -738,32 +707,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, return ERR_PTR(-ENOMEM); } } - #else -#define IPV4_BROADCAST_ADDR 0x000000FF -#define IPV4_MULTICAST_ADDR 0xE0000000 - if (ip_conntrack_max && - (ip_conntrack_max - atomic_read(&ip_conntrack_count)) <= RESERVE_CONNTRACK_NUM){ - if((atomic_read(&ip_conntrack_count) < ip_conntrack_max) && - (((tuple->dst).ip & IPV4_BROADCAST_ADDR == IPV4_BROADCAST_ADDR) || ((tuple->dst).ip & IPV4_BROADCAST_ADDR == IPV4_MULTICAST_ADDR) || !cmp_local_ip((tuple->dst).ip, (tuple->src).ip))){ - //packet for router(LAN side only) or packet from router, let it go thru - } - else{ - /* Try dropping from random chain, or else from the - chain about to put into (in case they're trying to - bomb one hash chain). */ - unsigned int next = (drop_next++)%ip_conntrack_htable_size; - - if (!early_drop(&ip_conntrack_hash[next]) - && !early_drop(&ip_conntrack_hash[hash])) { - if (net_ratelimit()) - printk(KERN_WARNING - "ip_conntrack: table full, dropping" - " packet.\n"); - return ERR_PTR(-ENOMEM); - } - } - } - #endif if (!invert_tuple(&repl_tuple, tuple, protocol)) { DEBUGP("Can't invert tuple.\n"); @@ -829,9 +772,12 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, conntrack, expected); /* Welcome, Mr. Bond. We've been expecting you... 
*/ IP_NF_ASSERT(master_ct(conntrack)); - conntrack->status = IPS_EXPECTED; + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); conntrack->master = expected; expected->sibling = conntrack; +#if CONFIG_IP_NF_CONNTRACK_MARK + conntrack->mark = expected->expectant->mark; +#endif LIST_DELETE(&ip_conntrack_expect_list, expected); INIT_LIST_HEAD(&expected->list); expected->expectant->expecting--; @@ -878,11 +824,11 @@ resolve_normal_ct(struct sk_buff *skb, *set_reply = 1; } else { /* Once we've had two way comms, always ESTABLISHED. */ - if (h->ctrack->status & IPS_SEEN_REPLY) { + if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) { DEBUGP("ip_conntrack_in: normal packet for %p\n", h->ctrack); *ctinfo = IP_CT_ESTABLISHED; - } else if (h->ctrack->status & IPS_EXPECTED) { + } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) { DEBUGP("ip_conntrack_in: related packet for %p\n", h->ctrack); *ctinfo = IP_CT_RELATED; @@ -1056,16 +1002,15 @@ int ip_conntrack_expect_related(struct ip_conntrack *related_to, } if (old) { - /************************* lzh add ****************************************** - * fix sip alg CDROUTE test fail - * 2007/3/16 - ***************************************************************************/ - if (old->help.exp_sip_info.nated && (old->help.exp_sip_info.type == CONN_RTP)) - { - DEBUGP("%s: found old exp and nated, rtp port=%d\n", __FUNCTION__,ntohs(old->tuple.dst.u.udp.port)); - related_to->help.ct_sip_info.rtpport = ntohs(old->tuple.dst.u.udp.port); +#if 0 // removed 1.11 forward bug test + if (1) { // 43011 (09?): checkme + // lzh add, fix sip alg CDROUTE test fail, 2007/3/16 + if (old->help.exp_sip_info.nated && (old->help.exp_sip_info.type == CONN_RTP)) { + DEBUGP("%s: found old exp and nated, rtp port=%d\n", __FUNCTION__,ntohs(old->tuple.dst.u.udp.port)); + related_to->help.ct_sip_info.rtpport = ntohs(old->tuple.dst.u.udp.port); + } } - /************************ lzh end ******************************************/ +#endif 
WRITE_UNLOCK(&ip_conntrack_lock); return -EEXIST; } @@ -1530,6 +1475,7 @@ int __init ip_conntrack_init(void) unsigned int i; int ret; +#if 0 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ if (hashsize) { @@ -1544,6 +1490,33 @@ int __init ip_conntrack_init(void) ip_conntrack_htable_size = 16; } ip_conntrack_max = 8 * ip_conntrack_htable_size; +#else +/* + + sizeof(list_head) = 8 + x 4096 = 32K + + sizeof(ip_conntrack) = 368 + x 2048 = 736K + +*/ + +#ifdef TEST_JHASH +/* + if (hashsize) ip_conntrack_htable_size = hashsize; + else ip_conntrack_htable_size = 4096; + ip_conntrack_max = 2048; +*/ + if (hashsize) ip_conntrack_htable_size = hashsize; + else ip_conntrack_htable_size = 8092; + ip_conntrack_max = 4096; +#else + if (hashsize) ip_conntrack_htable_size = hashsize; + else ip_conntrack_htable_size = 4099; + ip_conntrack_max = 2048; +#endif + +#endif printk("ip_conntrack version %s (%u buckets, %d max)" " - %d bytes per conntrack\n", IP_CONNTRACK_VERSION, @@ -1605,3 +1578,5 @@ err_unreg_sockopt: return -ENOMEM; } + + diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_h323.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_h323.c index cb0b1da5..c6172945 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_h323.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_h323.c @@ -104,30 +104,26 @@ static int h245_help(const struct iphdr *iph, size_t len, exp->seq = ntohl(tcph->seq) + i; - *((u_int32_t *)data) = ct->tuplehash[!dir].tuple.dst.ip; //!!! Netmeeting fix - - { - unsigned int chksum; - - chksum = csum_partial((char *)tcph + tcph->doff*4, - datalen, 0); - - tcph->check = 0; - tcph->check = tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr, - csum_partial((char *)tcph, tcph->doff*4, chksum)); - - } - + // 43011 (09?): checkme + if (1) { + unsigned int chksum; + + *((u_int32_t *)data) = ct->tuplehash[!dir].tuple.dst.ip; //!!! 
Netmeeting fix + chksum = csum_partial((char *)tcph + tcph->doff*4, datalen, 0); + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr, + csum_partial((char *)tcph, tcph->doff*4, chksum)); + } exp->tuple = ((struct ip_conntrack_tuple) { { ct->tuplehash[!dir].tuple.src.ip, { 0 } }, { data_ip, - { data_port }, + { .tcp = { data_port } }, IPPROTO_UDP }}); exp->mask = ((struct ip_conntrack_tuple) { { 0xFFFFFFFF, { 0 } }, - { 0xFFFFFFFF, { 0xFFFF }, 0xFFFF }}); + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFFFF }}); exp->expectfn = NULL; @@ -252,11 +248,11 @@ static int h225_help(const struct iphdr *iph, size_t len, { { ct->tuplehash[!dir].tuple.src.ip, { 0 } }, { data_ip, - { data_port }, + { .tcp = { data_port } }, IPPROTO_TCP }}); exp->mask = ((struct ip_conntrack_tuple) { { 0xFFFFFFFF, { 0 } }, - { 0xFFFFFFFF, { 0xFFFF }, 0xFFFF }}); + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFFFF }}); exp->expectfn = h225_expect; @@ -317,9 +313,7 @@ static void __exit fini(void) ip_conntrack_helper_unregister(&h225); } -#ifdef CONFIG_IP_NF_NAT_NEEDED EXPORT_SYMBOL(ip_h323_lock); -#endif module_init(init); module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_pptp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_pptp.c index 17cf2b7b..a0b41051 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_pptp.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_pptp.c @@ -1,5 +1,5 @@ /* - * ip_conntrack_pptp.c - Version 1.11 + * ip_conntrack_pptp.c - Version 1.9 * * Connection tracking support for PPTP (Point to Point Tunneling Protocol). * PPTP is a a protocol for creating virtual private networks. @@ -9,7 +9,7 @@ * GRE is defined in RFC 1701 and RFC 1702. 
Documentation of * PPTP can be found in RFC 2637 * - * (C) 2000-2002 by Harald Welte , + * (C) 2000-2003 by Harald Welte * * Development of this code funded by Astaro AG (http://www.astaro.com/) * @@ -21,6 +21,18 @@ * TODO: - finish support for multiple calls within one session * (needs expect reservations in newnat) * - testing of incoming PPTP calls + * + * Changes: + * 2002-02-05 - Version 1.3 + * - Call ip_conntrack_unexpect_related() from + * pptp_timeout_related() to destroy expectations in case + * CALL_DISCONNECT_NOTIFY or tcp fin packet was seen + * (Philip Craig ) + * - Add Version information at module loadtime + * 2002-02-10 - Version 1.6 + * - move to C99 style initializers + * - remove second expectation if first arrives + * */ #include @@ -35,13 +47,21 @@ #include #include +#define IP_CT_PPTP_VERSION "1.9" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); DECLARE_LOCK(ip_pptp_lock); +#if 0 +#include "ip_conntrack_pptp_priv.h" +#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ + ": " format, ## args) +#else #define DEBUGP(format, args...) 
+#endif #define SECS *HZ #define MINS * 60 SECS @@ -53,8 +73,8 @@ DECLARE_LOCK(ip_pptp_lock); static int pptp_expectfn(struct ip_conntrack *ct) { - struct ip_conntrack_expect *exp, *other_exp; struct ip_conntrack *master; + struct ip_conntrack_expect *exp; DEBUGP("increasing timeouts\n"); /* increase timeout of GRE data channel conntrack entry */ @@ -67,6 +87,12 @@ static int pptp_expectfn(struct ip_conntrack *ct) return 0; } + exp = ct->master; + if (!exp) { + DEBUGP("no expectation!!\n"); + return 0; + } + DEBUGP("completing tuples with ct info\n"); /* we can do this, since we're unconfirmed */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == @@ -83,6 +109,26 @@ static int pptp_expectfn(struct ip_conntrack *ct) ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = htonl(master->help.ct_pptp_info.pac_call_id); } + + /* delete other expectation */ + if (exp->expected_list.next != &exp->expected_list) { + struct ip_conntrack_expect *other_exp; + struct list_head *cur_item, *next; + + for (cur_item = master->sibling_list.next; + cur_item != &master->sibling_list; cur_item = next) { + next = cur_item->next; + other_exp = list_entry(cur_item, + struct ip_conntrack_expect, + expected_list); + /* remove only if occurred at same sequence number */ + if (other_exp != exp && other_exp->seq == exp->seq) { + DEBUGP("unexpecting other direction\n"); + ip_ct_gre_keymap_destroy(other_exp); + ip_conntrack_unexpect_related(other_exp); + } + } + } return 0; } @@ -90,15 +136,21 @@ static int pptp_expectfn(struct ip_conntrack *ct) /* timeout GRE data connections */ static int pptp_timeout_related(struct ip_conntrack *ct) { - struct list_head *cur_item; + struct list_head *cur_item, *next; struct ip_conntrack_expect *exp; - list_for_each(cur_item, &ct->sibling_list) { + /* FIXME: do we have to lock something ? 
*/ + for (cur_item = ct->sibling_list.next; + cur_item != &ct->sibling_list; cur_item = next) { + next = cur_item->next; exp = list_entry(cur_item, struct ip_conntrack_expect, expected_list); - if (!exp->sibling) + ip_ct_gre_keymap_destroy(exp); + if (!exp->sibling) { + ip_conntrack_unexpect_related(exp); continue; + } DEBUGP("setting timeout of conntrack %p to 0\n", exp->sibling); @@ -110,7 +162,7 @@ static int pptp_timeout_related(struct ip_conntrack *ct) return 0; } -/* expect GRE connection in PNS->PAC direction */ +/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ static inline int exp_gre(struct ip_conntrack *master, u_int32_t seq, @@ -121,7 +173,7 @@ exp_gre(struct ip_conntrack *master, struct ip_conntrack_tuple inv_tuple; memset(&exp, 0, sizeof(exp)); - /* tuple in original direction, PAC->PNS */ + /* tuple in original direction, PNS->PAC */ exp.tuple.src.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; exp.tuple.src.u.gre.key = htonl(ntohs(peer_callid)); exp.tuple.dst.ip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; @@ -148,12 +200,44 @@ exp_gre(struct ip_conntrack *master, DEBUGP("calling expect_related "); DUMP_TUPLE_RAW(&exp.tuple); + /* Add GRE keymap entries */ + if (ip_ct_gre_keymap_add(&exp, &exp.tuple, 0) != 0) + return 1; + + invert_tuplepr(&inv_tuple, &exp.tuple); + if (ip_ct_gre_keymap_add(&exp, &inv_tuple, 1) != 0) { + ip_ct_gre_keymap_destroy(&exp); + return 1; + } + + if (ip_conntrack_expect_related(master, &exp) != 0) { + ip_ct_gre_keymap_destroy(&exp); + DEBUGP("cannot expect_related()\n"); + return 1; + } + + /* tuple in reply direction, PAC->PNS */ + exp.tuple.src.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; + exp.tuple.src.u.gre.key = htonl(ntohs(callid)); + exp.tuple.dst.ip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; + exp.tuple.dst.u.gre.key = htonl(ntohs(peer_callid)); + + DEBUGP("calling expect_related "); + DUMP_TUPLE_RAW(&exp.tuple); + /* Add GRE keymap entries */ 
ip_ct_gre_keymap_add(&exp, &exp.tuple, 0); invert_tuplepr(&inv_tuple, &exp.tuple); ip_ct_gre_keymap_add(&exp, &inv_tuple, 1); + /* FIXME: cannot handle error correctly, since we need to free + * the above keymap :( */ - ip_conntrack_expect_related(master, &exp); + if (ip_conntrack_expect_related(master, &exp) != 0) { + /* free the second pair of keymaps */ + ip_ct_gre_keymap_destroy(&exp); + DEBUGP("cannot expect_related():\n"); + return 1; + } return 0; } @@ -240,7 +324,8 @@ pptp_inbound_pkt(struct tcphdr *tcph, info->cstate = PPTP_CALL_OUT_CONF; seq = ntohl(tcph->seq) + ((void *)pcid - (void *)pptph); - exp_gre(ct, seq, *cid, *pcid); + if (exp_gre(ct, seq, *cid, *pcid) != 0) + printk("ip_conntrack_pptp: error during exp_gre\n"); break; case PPTP_IN_CALL_REQUEST: @@ -282,7 +367,8 @@ pptp_inbound_pkt(struct tcphdr *tcph, /* we expect a GRE connection from PAC to PNS */ seq = ntohl(tcph->seq) + ((void *)pcid - (void *)pptph); - exp_gre(ct, seq, *cid, *pcid); + if (exp_gre(ct, seq, *cid, *pcid) != 0) + printk("ip_conntrack_pptp: error during exp_gre\n"); break; @@ -294,7 +380,6 @@ pptp_inbound_pkt(struct tcphdr *tcph, /* untrack this call id, unexpect GRE packets */ pptp_timeout_related(ct); - /* NEWNAT: look up exp for call id and unexpct_related */ break; case PPTP_WAN_ERROR_NOTIFY: @@ -446,7 +531,8 @@ conntrack_pptp_help(const struct iphdr *iph, size_t len, if (tcp_v4_check(tcph, tcplen, iph->saddr, iph->daddr, csum_partial((char *) tcph, tcplen, 0))) { printk(KERN_NOTICE __FILE__ ": bad csum\n"); -// return NF_ACCEPT; + /* W2K PPTP server sends TCP packets with wrong checksum :(( */ + //return NF_ACCEPT; } if (tcph->fin || tcph->rst) { @@ -456,8 +542,6 @@ conntrack_pptp_help(const struct iphdr *iph, size_t len, /* untrack this call id, unexpect GRE packets */ pptp_timeout_related(ct); - /* no need to call unexpect_related since master conn - * dies anyway */ } @@ -482,6 +566,8 @@ conntrack_pptp_help(const struct iphdr *iph, size_t len, LOCK_BH(&ip_pptp_lock); + 
/* FIXME: We just blindly assume that the control connection is always + * established from PNS->PAC. However, RFC makes no guarantee */ if (dir == IP_CT_DIR_ORIGINAL) /* client -> server (PNS -> PAC) */ ret = pptp_outbound_pkt(tcph, pptph, datalen, ct, ctinfo); @@ -497,13 +583,31 @@ conntrack_pptp_help(const struct iphdr *iph, size_t len, /* control protocol helper */ static struct ip_conntrack_helper pptp = { - { NULL, NULL }, - "pptp", IP_CT_HELPER_F_REUSE_EXPECT, THIS_MODULE, 2, 0, - { { 0, { tcp: { port: __constant_htons(PPTP_CONTROL_PORT) } } }, - { 0, { 0 }, IPPROTO_TCP } }, - { { 0, { tcp: { port: 0xffff } } }, - { 0, { 0 }, 0xffff } }, - conntrack_pptp_help }; + .list = { NULL, NULL }, + .name = "pptp", + .flags = IP_CT_HELPER_F_REUSE_EXPECT, + .me = THIS_MODULE, + .max_expected = 2, + .timeout = 0, + .tuple = { .src = { .ip = 0, + .u = { .tcp = { .port = + __constant_htons(PPTP_CONTROL_PORT) } } + }, + .dst = { .ip = 0, + .u = { .all = 0 }, + .protonum = IPPROTO_TCP + } + }, + .mask = { .src = { .ip = 0, + .u = { .tcp = { .port = 0xffff } } + }, + .dst = { .ip = 0, + .u = { .all = 0 }, + .protonum = 0xffff + } + }, + .help = conntrack_pptp_help +}; /* ip_conntrack_pptp initialization */ static int __init init(void) @@ -517,12 +621,14 @@ static int __init init(void) return -EIO; } + printk("ip_conntrack_pptp version %s loaded\n", IP_CT_PPTP_VERSION); return 0; } static void __exit fini(void) { ip_conntrack_helper_unregister(&pptp); + printk("ip_conntrack_pptp version %s unloaded\n", IP_CT_PPTP_VERSION); } module_init(init); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_esp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_esp.c old mode 100755 new mode 100644 diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 4ff06dcb..55b3ecea 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_gre.c 
+++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -1,5 +1,5 @@ /* - * ip_conntrack_proto_gre.c - Version 1.11 + * ip_conntrack_proto_gre.c - Version 1.2 * * Connection tracking protocol helper module for GRE. * @@ -17,7 +17,7 @@ * * Documentation about PPTP can be found in RFC 2637 * - * (C) 2000-2002 by Harald Welte + * (C) 2000-2003 by Harald Welte * * Development of this code funded by Astaro AG (http://www.astaro.com/) * @@ -54,8 +54,18 @@ MODULE_DESCRIPTION("netfilter connection tracking protocol helper for GRE"); #define GRE_TIMEOUT (30*HZ) #define GRE_STREAM_TIMEOUT (180*HZ) +#if 0 +#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ + ": " format, ## args) +#define DUMP_TUPLE_GRE(x) printk("%u.%u.%u.%u:0x%x -> %u.%u.%u.%u:0x%x:%u:0x%x\n", \ + NIPQUAD((x)->src.ip), ntohl((x)->src.u.gre.key), \ + NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.gre.key), \ + (x)->dst.u.gre.version, \ + ntohs((x)->dst.u.gre.protocol)) +#else #define DEBUGP(x, args...) 
#define DUMP_TUPLE_GRE(x) +#endif /* GRE KEYMAP HANDLING FUNCTIONS */ static LIST_HEAD(gre_keymap_list); @@ -103,7 +113,6 @@ int ip_ct_gre_keymap_add(struct ip_conntrack_expect *exp, memset(km, 0, sizeof(*km)); memcpy(&km->tuple, t, sizeof(*t)); - km->master = exp; if (!reply) exp->proto.gre.keymap_orig = km; @@ -132,6 +141,26 @@ void ip_ct_gre_keymap_change(struct ip_ct_gre_keymap *km, WRITE_UNLOCK(&ip_ct_gre_lock); } +/* destroy the keymap entries associated with specified expect */ +void ip_ct_gre_keymap_destroy(struct ip_conntrack_expect *exp) +{ + DEBUGP("entering for exp %p\n", exp); + WRITE_LOCK(&ip_ct_gre_lock); + if (exp->proto.gre.keymap_orig) { + DEBUGP("removing %p from list\n", exp->proto.gre.keymap_orig); + list_del(&exp->proto.gre.keymap_orig->list); + kfree(exp->proto.gre.keymap_orig); + exp->proto.gre.keymap_orig = NULL; + } + if (exp->proto.gre.keymap_reply) { + DEBUGP("removing %p from list\n", exp->proto.gre.keymap_reply); + list_del(&exp->proto.gre.keymap_reply->list); + kfree(exp->proto.gre.keymap_reply); + exp->proto.gre.keymap_reply = NULL; + } + WRITE_UNLOCK(&ip_ct_gre_lock); +} + /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ @@ -186,6 +215,10 @@ static int gre_pkt_to_tuple(const void *datah, size_t datalen, srckey = gre_keymap_lookup(tuple); +#if 0 + DEBUGP("found src key %x for tuple ", ntohl(srckey)); + DUMP_TUPLE_GRE(tuple); +#endif tuple->src.u.gre.key = srckey; return 1; @@ -256,18 +289,7 @@ static void gre_destroy(struct ip_conntrack *ct) return; } - WRITE_LOCK(&ip_ct_gre_lock); - if (master->proto.gre.keymap_orig) { - DEBUGP("removing %p from list\n", master->proto.gre.keymap_orig); - list_del(&master->proto.gre.keymap_orig->list); - kfree(master->proto.gre.keymap_orig); - } - if (master->proto.gre.keymap_reply) { - DEBUGP("removing %p from list\n", master->proto.gre.keymap_reply); - list_del(&master->proto.gre.keymap_reply->list); - kfree(master->proto.gre.keymap_reply); - } - WRITE_UNLOCK(&ip_ct_gre_lock); + 
ip_ct_gre_keymap_destroy(master); } /* protocol helper struct */ @@ -304,7 +326,7 @@ static void __exit fini(void) /* delete all keymap entries */ WRITE_LOCK(&ip_ct_gre_lock); list_for_each_safe(pos, n, &gre_keymap_list) { - DEBUGP("deleting keymap %p\n", pos); + DEBUGP("deleting keymap %p at module unload time\n", pos); list_del(pos); kfree(pos); } @@ -315,6 +337,7 @@ static void __exit fini(void) EXPORT_SYMBOL(ip_ct_gre_keymap_add); EXPORT_SYMBOL(ip_ct_gre_keymap_change); +EXPORT_SYMBOL(ip_ct_gre_keymap_destroy); module_init(init); module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 02f20742..ebb9b493 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -173,7 +173,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. 
--RR */ - if (!(conntrack->status & IPS_SEEN_REPLY) && tcph->rst) { + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) && tcph->rst) { WRITE_UNLOCK(&tcp_lock); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long)conntrack); @@ -211,12 +211,6 @@ static int tcp_new(struct ip_conntrack *conntrack, return 0; } - if (tcph->syn && tcph->ack) - { - DEBUGP("ip_conntrack_tcp: invalid new deleting.\n"); - return 0; - } - conntrack->proto.tcp.state = newconntrack; return 1; } diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 49ac62c7..af660a27 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -7,25 +7,13 @@ #include #include -unsigned long ip_ct_udp_isakmp_timeout = (300*HZ); - static int udp_pkt_to_tuple(const void *datah, size_t datalen, struct ip_conntrack_tuple *tuple) { const struct udphdr *hdr = datah; - struct isakmp_hdr *isakmp_h = (void *)hdr + 8; tuple->src.u.udp.port = hdr->source; tuple->dst.u.udp.port = hdr->dest; - if(ntohs(hdr->source) == 500 && ntohs(hdr->dest) == 500) - { - if(NULL == isakmp_h) - tuple->dst.u.udp.init_cookie = 0; - else - tuple->dst.u.udp.init_cookie = (unsigned int)(isakmp_h->init_cookie[0]); - } - else - tuple->dst.u.udp.init_cookie = 0; return 1; } @@ -35,7 +23,6 @@ static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, { tuple->src.u.udp.port = orig->dst.u.udp.port; tuple->dst.u.udp.port = orig->src.u.udp.port; - tuple->dst.u.udp.init_cookie = orig->dst.u.udp.init_cookie; return 1; } @@ -60,21 +47,16 @@ static int udp_packet(struct ip_conntrack *conntrack, struct iphdr *iph, size_t len, enum ip_conntrack_info conntrackinfo) { - u_int16_t *portptr; - portptr = &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; /* If we've seen traffic both ways, this is some kind of UDP stream. 
Extend timeout. */ - if (conntrack->status & IPS_SEEN_REPLY) - { - if(ntohs(*portptr) == 500) - ip_ct_refresh(conntrack, ip_ct_udp_isakmp_timeout); - else - ip_ct_refresh(conntrack, sysctl_ip_conntrack_udp_timeouts[UDP_STREAM_TIMEOUT]); + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh(conntrack, + sysctl_ip_conntrack_udp_timeouts[UDP_STREAM_TIMEOUT]); /* Also, more likely to be important, and not a probe */ set_bit(IPS_ASSURED_BIT, &conntrack->status); - } - else - ip_ct_refresh(conntrack, sysctl_ip_conntrack_udp_timeouts[UDP_TIMEOUT]); + } else + ip_ct_refresh(conntrack, + sysctl_ip_conntrack_udp_timeouts[UDP_TIMEOUT]); return NF_ACCEPT; } diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_standalone.c b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_standalone.c index c7e31931..78c3062c 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -32,503 +32,6 @@ struct module *ip_conntrack_module = THIS_MODULE; MODULE_LICENSE("GPL"); -#define CLEAR_IP_CONNTRACK -#define DEL_IP_CONNTRACK_ENTRY 1 -#ifdef DEL_IP_CONNTRACK_ENTRY -/* - * - *This part of code add for delete an entry in ip_conntrack table. 
- * - */ - - -#define DEL_LIST_PATH "/tmp/.del_ip_conntrack" -#define printkerrline() printk("del_ip_conntrack error : %s %s %d\n", __FILE__, __func__, __LINE__) - -struct del_list -{ - unsigned short proto; - unsigned int begin_port; - unsigned int end_port; - unsigned int ip; - struct del_list *next; -}; - -void free_del_list(struct del_list *head); -void print_del_list(struct del_list *head); -static struct del_list * malloc_new_node(const char *buf, struct del_list * head); -struct del_list * init_del_list(const char *buf, size_t size); -static int read_del_file(char * buf, unsigned int size, char *path); -static int del_match_method(const struct ip_conntrack_tuple_hash *pConn, const struct del_list * pList); -static int del_conntrack_check(const struct ip_conntrack_tuple_hash *pConn, const struct del_list * head); -void pf_del_ip_conntrack(void); -static int proc_read_del_ip_conntrack(char *page, char **start, off_t off, int count, int *eof, void *context); -static int proc_write_del_ip_conntrack(struct file *file, const char *buffer, unsigned long count, void *data); -static int end_proc_read(const char *p, char *page, off_t off, int count, char **start, int *eof); - -void pf_del_ip_conntrack(void) -{ -#define MAX_BUF_SIZE 1024*2 - int i; - char buf[MAX_BUF_SIZE]; - struct del_list * del_head = NULL; - struct list_head *head, *temp_head; - struct ip_conntrack_tuple_hash *tuple_hash; - - //printk("pf_del_ip_conntrack---------------------------------------1\n"); - memset(buf, 0, MAX_BUF_SIZE); - - if(read_del_file(buf, MAX_BUF_SIZE, DEL_LIST_PATH) == -1) - { - goto final_return; - } - - buf[MAX_BUF_SIZE - 1] = '\0'; - del_head = init_del_list(buf, MAX_BUF_SIZE - 1); - //print_del_list(del_head); - READ_LOCK(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) - { - head = &ip_conntrack_hash[i]; - temp_head = head; - while(1) - { - temp_head = temp_head->next; - if(temp_head == head) - { - head = NULL; - temp_head = NULL; - break; - } - 
tuple_hash = (struct ip_conntrack_tuple_hash *)temp_head; - if(del_conntrack_check(tuple_hash, del_head) == 1) - { - del_selected_conntrack(tuple_hash); - } - } - } - READ_UNLOCK(&ip_conntrack_lock); - free_del_list(del_head); - -final_return: - - //printk("pf_del_ip_conntrack---------------------------------------2\n"); - return; -#undef MAX_BUF_SIZE -} - -static int del_conntrack_check(const struct ip_conntrack_tuple_hash *pConn, const struct del_list * head) -{ - int ret; - const struct del_list * p; - - ret = 0; - - if(pConn == NULL || head == NULL) - { - ret = -1; - goto final_return; - } - - for(p = head; p; p = p->next) - { - if(del_match_method(pConn, p) == 1) - { - //Match,jump out - ret = 1; - break; - } - } - -final_return: - return ret; -} - -static int del_match_method(const struct ip_conntrack_tuple_hash *pConn, const struct del_list * pList) -{ - int ret; - typedef enum - { - TCP_PROTO = 0x06, - UDP_PROTO = 0x11, - }proto_type; - proto_type pt[2] = {TCP_PROTO, UDP_PROTO}; - - ret = 0; - //Check tcp and udp only - if(pConn->tuple.dst.protonum == TCP_PROTO || pConn->tuple.dst.protonum == UDP_PROTO) - { - //Check proto match - if((pList->proto == 3) || - ((pList->proto == 0 || pList->proto == 1) && (pConn->tuple.dst.protonum == pt[pList->proto]))) - { - //Chcek ip address match - if(pConn->ctrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == pList->ip) - { - //Check port match - unsigned int tport; - if(pConn->tuple.dst.protonum == TCP_PROTO) - { - //TCP - tport = pConn->ctrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - } - else - { - //UDP - tport = pConn->ctrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.udp.port; - } - tport = htons(tport); - if(tport >= pList->begin_port && tport <= pList->end_port) - { - ret = 1; - } - } - } - } - return ret; -} - -static int read_del_file(char * buf, unsigned int size, char *path) -{ - int retval, orgfsuid, orgfsgid; - mm_segment_t orgfs; - struct file *srcf; - - // Save uid and gid used for filesystem 
access. - // Set user and group to 0 (root) - orgfsuid = current->fsuid; - orgfsgid = current->fsgid; - current->fsuid=current->fsgid = 0; - orgfs = get_fs(); - set_fs(KERNEL_DS); - - if(path && *path) - { - srcf = filp_open(path, O_RDONLY, 0); - if(IS_ERR(srcf)) - { - printkerrline(); - retval = -1; - goto final_return; - } - else - { - if(srcf->f_op && srcf->f_op->read) - { - memset(buf, 0x00, size); - retval=srcf->f_op->read(srcf, buf, size, &srcf->f_pos); - if(retval < 0) - { - printkerrline(); - retval = -1; - goto final_return; - } - else - { - //Success,go! - retval = 0; - goto final_return; - } - } - else - { - printkerrline(); - retval = -1; - goto final_return; - } - } - } - else - { - printkerrline(); - retval = -1; - goto final_return; - } - -final_return: - if(!IS_ERR(srcf)) - { - retval=filp_close(srcf,NULL); - if(retval) - { - printkerrline(); - retval = -1; - } - } - set_fs(orgfs); - current->fsuid = orgfsuid; - current->fsgid = orgfsgid; - - return retval; -} - -struct del_list * init_del_list(const char *buf, size_t size) -{ -#define LINE_FEED "\n" -#define TMP_BUF_SIZE 100 - const char *begin, *end; - char tmpbuf[TMP_BUF_SIZE]; - struct del_list * head = NULL, *tmp_p; - - if(buf == NULL || size <= 0 || buf[size] != '\0') - { - head = NULL; - goto final_return; - } - - for(begin = end = buf; begin && (begin - buf < size); begin = end + strlen(LINE_FEED)) - { - end = strstr(begin, LINE_FEED); - if(end) - { - if((end - begin) > (TMP_BUF_SIZE - 1)) - { - //Too large,go on - continue; - } - else - { - memcpy(tmpbuf, begin, end - begin); - tmpbuf[end - begin] = '\0'; - //printk("obtain string : %s\n", tmpbuf); - if((tmp_p = malloc_new_node(tmpbuf, head)) == NULL) - { - //Invalid format or malloc fail,go on - continue; - } - else - { - head = tmp_p; - } - } - } - else - { - //printk("Last string : %s\n", begin); - if((tmp_p = malloc_new_node(begin, head)) == NULL) - { - //Invalid format or malloc fail,jump out - break; - } - else - { - head = tmp_p; - } 
- } - } - -final_return: - return head; - -#undef TMP_BUF_SIZE -#undef LINE_FEED -} - -static struct del_list * malloc_new_node(const char *buf, struct del_list * head) -{ -#define SSCANF_MATCH_NUM 7 - int i, j, k, c1, c2, c3, c4; - struct del_list *p = NULL; - - if(sscanf(buf, "%d %d.%d.%d.%d %d-%d", &i, &c4, &c3, &c2, &c1, &j, &k) != SSCANF_MATCH_NUM) - { - p = NULL; - goto final_return; - } - else - { - if(p = (struct del_list *)kmalloc(sizeof(struct del_list), GFP_ATOMIC)) - { - p->proto = i; - #if 0 - //Big endian - ((char *)&(p->ip))[0] = (char)c1; - ((char *)&(p->ip))[1] = (char)c2; - ((char *)&(p->ip))[2] = (char)c3; - ((char *)&(p->ip))[3] = (char)c4; - #else - //Little endian - ((char *)&(p->ip))[3] = (char)c1; - ((char *)&(p->ip))[2] = (char)c2; - ((char *)&(p->ip))[1] = (char)c3; - ((char *)&(p->ip))[0] = (char)c4; - #endif - p->begin_port = j; - p->end_port = k; - p->next = head; - } - else - { - p = NULL; - goto final_return; - } - } - -final_return: - return p; -#undef SSCANF_MATCH_NUM -} - -void print_del_list(struct del_list *head) -{ - int i; - struct del_list *tmp_p; - - for(i = 1, tmp_p = head; tmp_p; tmp_p = tmp_p->next, i++) - { - printk("Node(%d): proto=%d | ip=%0x | port=[%d-%d]\n", i, tmp_p->proto, tmp_p->ip, tmp_p->begin_port, tmp_p->end_port); - } -} - -void free_del_list(struct del_list *head) -{ - int i; - struct del_list *tmp_p; - - if(head == NULL) - { - goto final_return; - } - for(i = 1, tmp_p = head; head; head = tmp_p, i++) - { - tmp_p = head->next; - //printk("Free@Node(%d):proto=%d | ip=%0x | port=[%d-%d]\n", i, head->proto, head->ip, head->begin_port, head->end_port); - kfree(head); - } - -final_return: - return; -} - -static int proc_read_del_ip_conntrack(char *page, char **start, off_t off, int count, int *eof, void *context) -{ - char *p; - - p = page; - p += sprintf(page, "%s\n", "use echo \"1(0)\" to enable or disbable"); - return end_proc_read(p, page, off, count, start, eof); -} - -static int 
proc_write_del_ip_conntrack(struct file *file, const char *buffer, unsigned long count, void *data) -{ - unsigned char tmp[2]; - - if(buffer) - { - memset(tmp, 0, sizeof(tmp)); - copy_from_user(tmp, buffer, count); - tmp[1] = 0x00; - switch(*tmp) - { - case '0': - //Do something here - break; - - case '1': - pf_del_ip_conntrack(); - break; - - default: - printk("<1>invalid args\n"); - } - return count; - } - return 0; -} - -static int end_proc_read(const char *p, char *page, off_t off, int count, char **start, int *eof) -{ - int len = p - page; - - if(len < off + count) - { - *eof = 1; - } - - *start = page + off; - len -= off; - if(len > count) - { - len = count; - } - - if(len < 0) - { - len = 0; - } - - return len; -} - -#endif - -#ifdef CLEAR_IP_CONNTRACK -void clear_ip_conntrack(void) -{ - int i; - struct list_head *head, *temp_head; - struct ip_conntrack_tuple_hash *tuple_hash; - - printk("warning : %s %d\n", __func__, __LINE__); - - READ_LOCK(&ip_conntrack_lock); - for (i = 0; i < ip_conntrack_htable_size; i++) - { - head = &ip_conntrack_hash[i]; - temp_head = head; - while(1) - { - temp_head = temp_head->next; - if(temp_head == head) - { - head = NULL; - temp_head = NULL; - break; - } - tuple_hash = (struct ip_conntrack_tuple_hash *)temp_head; - del_selected_conntrack(tuple_hash); - } - } - READ_UNLOCK(&ip_conntrack_lock); -} - -static int proc_read_clear_ip_conntrack(char *page, char **start, off_t off, int count, int *eof, void *context) -{ - char *p; - - p = page; - p += sprintf(page, "%s\n", "use echo \"1(0)\" to enable or disbable"); - return end_proc_read(p, page, off, count, start, eof); -} - -static int proc_write_clear_ip_conntrack(struct file *file, const char *buffer, unsigned long count, void *data) -{ - unsigned char tmp[2]; - - if(buffer) - { - memset(tmp, 0, sizeof(tmp)); - copy_from_user(tmp, buffer, count); - tmp[1] = 0x00; - switch(*tmp) - { - case '0': - //Do something here - break; - - case '1': - clear_ip_conntrack(); - break; - - 
default: - printk("<1>invalid args\n"); - } - return count; - } - return 0; -} -#endif - static int kill_proto(const struct ip_conntrack *i, void *data) { return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == @@ -575,7 +78,7 @@ print_expect(char *buffer, const struct ip_conntrack_expect *expect) } static unsigned int -print_conntrack(char *buffer, const struct ip_conntrack *conntrack) +print_conntrack(char *buffer, struct ip_conntrack *conntrack) { unsigned int len; struct ip_conntrack_protocol *proto @@ -593,15 +96,38 @@ print_conntrack(char *buffer, const struct ip_conntrack *conntrack) len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, proto); - if (!(conntrack->status & IPS_SEEN_REPLY)) + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) len += sprintf(buffer + len, "[UNREPLIED] "); len += print_tuple(buffer + len, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, proto); - if (conntrack->status & IPS_ASSURED) + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) len += sprintf(buffer + len, "[ASSURED] "); len += sprintf(buffer + len, "use=%u ", atomic_read(&conntrack->ct_general.use)); +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + len += sprintf(buffer + len, "mark=%ld ", conntrack->mark); +#endif +#if defined(CONFIG_IP_NF_MATCH_LAYER7) || defined(CONFIG_IP_NF_MATCH_LAYER7_MODULE) + if(conntrack->layer7.app_proto) + len += sprintf(buffer + len, "l7proto=%s ", + conntrack->layer7.app_proto); +#endif +#if defined(CONFIG_IP_NF_TARGET_MACSAVE) || defined(CONFIG_IP_NF_TARGET_MACSAVE_MODULE) + if ((*((u32 *)conntrack->macsave) != 0) || (*((u16*)(conntrack->macsave + 4)) != 0)) { + len += sprintf(buffer + len, "macsave=%02X:%02X:%02X:%02X:%02X:%02X ", + conntrack->macsave[0], conntrack->macsave[1], conntrack->macsave[2], + conntrack->macsave[3], conntrack->macsave[4], conntrack->macsave[5]); + } +#endif +#if defined(CONFIG_IP_NF_TARGET_BCOUNT) || defined(CONFIG_IP_NF_TARGET_BCOUNT_MODULE) +#if 0 + if (conntrack->bcount != 0) { 
+// len += sprintf(buffer + len, "bcount=0x%lx ", conntrack->bcount); + len += sprintf(buffer + len, "bcount=%ldK ", conntrack->bcount / 1024); + } +#endif +#endif len += sprintf(buffer + len, "\n"); return len; @@ -748,30 +274,6 @@ static int init_or_cleanup(int init) if (ret < 0) goto cleanup_nothing; -#ifdef DEL_IP_CONNTRACK_ENTRY - proc = proc_net_create("del_ip_conntrack", S_IFREG | S_IRUGO | S_IWUSR, proc_read_del_ip_conntrack); - if(proc) - { - proc->write_proc = proc_write_del_ip_conntrack; - proc->owner = THIS_MODULE; - } - else - { - //Maybe we can just let it go! - } -#endif -#ifdef CLEAR_IP_CONNTRACK - proc = proc_net_create("clear_ip_conntrack", S_IFREG | S_IRUGO | S_IWUSR, proc_read_clear_ip_conntrack); - if(proc) - { - proc->write_proc = proc_write_clear_ip_conntrack; - proc->owner = THIS_MODULE; - } - else - { - //Maybe we can just let it go! - } -#endif proc = proc_net_create("ip_conntrack",0,list_conntracks); if (!proc) goto cleanup_init; proc->owner = THIS_MODULE; diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c index f64ddabf..de6b4925 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c @@ -763,12 +763,11 @@ do_bindings(struct ip_conntrack *ct, /* if this expectation is already established, skip */ if (exp->sibling) { - //lzh add 2007/3/16 for fix sip alg CDROUTE test - exp = NULL; - //lzh end +#if 0 // removed 1.11 forward bug test + exp = NULL; // lzh add 2007/3/16 for fix sip alg CDROUTE test +#endif continue; } - if (exp_for_packet(exp, pskb)) { DEBUGP("calling nat helper (exp=%p) for packet\n", diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_h323.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_h323.c index 2c080cf3..bcf886e8 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_h323.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_h323.c @@ 
-129,7 +129,7 @@ h225_nat_expected(struct sk_buff **pskb, mr.range[0].flags |= IP_NAT_RANGE_PROTO_SPECIFIED; mr.range[0].min = mr.range[0].max = ((union ip_conntrack_manip_proto) - { port }); + { .tcp = { port } }); } ret = ip_nat_setup_info(ct, &mr, hooknum); @@ -184,12 +184,14 @@ static int h323_signal_address_fixup(struct ip_conntrack *ct, if (!between(info->seq[i] + 6, ntohl(tcph->seq), ntohl(tcph->seq) + datalen)) { /* Partial retransmisison. It's a cracker being funky. */ +#if 0 // ... or a miss id? zzz if (net_ratelimit()) { printk("H.323_NAT: partial packet %u/6 in %u/%u\n", info->seq[i], ntohl(tcph->seq), ntohl(tcph->seq) + datalen); } +#endif return 0; } @@ -252,18 +254,18 @@ static int h323_data_fixup(struct ip_ct_h225_expect *info, DEBUGP("h323_data_fixup: offset %u + 6 in %u\n", info->offset, tcplen); DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); - - memset(&newtuple, 0, sizeof(newtuple)); if (!between(expect->seq + 6, ntohl(tcph->seq), ntohl(tcph->seq) + tcplen - tcph->doff * 4)) { /* Partial retransmisison. It's a cracker being funky. */ +#if 1 // also caused by bad id? 
if (net_ratelimit()) { printk("H.323_NAT: partial packet %u/6 in %u/%u\n", expect->seq, ntohl(tcph->seq), ntohl(tcph->seq) + tcplen - tcph->doff * 4); } +#endif return 0; } @@ -392,9 +394,9 @@ static struct ip_nat_helper h225 = "H.225", /* name */ IP_NAT_HELPER_F_ALWAYS, /* flags */ THIS_MODULE, /* module */ - { { 0, { __constant_htons(H225_PORT) } }, /* tuple */ + { { 0, { .tcp = { __constant_htons(H225_PORT) } } }, /* tuple */ { 0, { 0 }, IPPROTO_TCP } }, - { { 0, { 0xFFFF } }, /* mask */ + { { 0, { .tcp = { 0xFFFF } } }, /* mask */ { 0, { 0 }, 0xFFFF } }, h225_nat_help, /* helper */ h225_nat_expected /* expectfn */ diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_helper.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_helper.c index ffde5133..e7987430 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_helper.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_helper.c @@ -79,7 +79,6 @@ ip_nat_resize_packet(struct sk_buff **skb, iph = (*skb)->nh.iph; if (iph->protocol == IPPROTO_TCP) { struct tcphdr *tcph = (void *)iph + iph->ihl*4; - void *data = (void *)tcph + tcph->doff*4; DEBUGP("ip_nat_resize_packet: Seq_offset before: "); DUMP_OFFSET(this_way); @@ -354,54 +353,49 @@ sack_adjust(struct tcphdr *tcph, } -/* TCP SACK sequence number adjustment, return 0 if sack found and adjusted */ -static inline int +/* TCP SACK sequence number adjustment. 
*/ +static inline void ip_nat_sack_adjust(struct sk_buff *skb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { - struct iphdr *iph; struct tcphdr *tcph; - unsigned char *ptr; - int length, dir, sack_adjusted = 0; + unsigned char *ptr, *optend; + unsigned int dir; - iph = skb->nh.iph; - tcph = (void *)iph + iph->ihl*4; - length = (tcph->doff*4)-sizeof(struct tcphdr); + tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; + optend = (unsigned char *)tcph + tcph->doff*4; ptr = (unsigned char *)(tcph+1); dir = CTINFO2DIR(ctinfo); - while (length > 0) { - int opcode = *ptr++; + while (ptr < optend) { + int opcode = ptr[0]; int opsize; switch (opcode) { case TCPOPT_EOL: - return !sack_adjusted; + return; case TCPOPT_NOP: - length--; + ptr++; continue; default: - opsize = *ptr++; - if (opsize > length) /* no partial opts */ - return !sack_adjusted; + opsize = ptr[1]; + /* no partial opts */ + if (ptr + opsize > optend || opsize < 2) + return; if (opcode == TCPOPT_SACK) { /* found SACK */ if((opsize >= (TCPOLEN_SACK_BASE +TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) - sack_adjust(tcph, ptr-2, + sack_adjust(tcph, ptr, &ct->nat.info.seq[!dir]); - - sack_adjusted = 1; } - ptr += opsize-2; - length -= opsize; + ptr += opsize; } } - return !sack_adjusted; } /* TCP sequence number adjustment */ diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_pptp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_pptp.c index 71077933..358a4579 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_pptp.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_pptp.c @@ -1,5 +1,5 @@ /* - * ip_nat_pptp.c - Version 1.11 + * ip_nat_pptp.c - Version 1.5 * * NAT support for PPTP (Point to Point Tunneling Protocol). * PPTP is a a protocol for creating virtual private networks. @@ -9,7 +9,7 @@ * GRE is defined in RFC 1701 and RFC 1702. 
Documentation of * PPTP can be found in RFC 2637 * - * (C) 2000-2002 by Harald Welte + * (C) 2000-2003 by Harald Welte * * Development of this code funded by Astaro AG (http://www.astaro.com/) * @@ -17,7 +17,18 @@ * (needs netfilter newnat code) * - NAT to a unique tuple, not to TCP source port * (needs netfilter tuple reservation) - * - Support other NAT scenarios than SNAT of PNS + * + * Changes: + * 2002-02-10 - Version 1.3 + * - Use ip_nat_mangle_tcp_packet() because of cloned skb's + * in local connections (Philip Craig ) + * - add checks for magicCookie and pptp version + * - make argument list of pptp_{out,in}bound_packet() shorter + * - move to C99 style initializers + * - print version number at module loadtime + * 2003-09-22 - Version 1.5 + * - use SNATed tcp sourceport as callid, since we get called before + * TCP header is mangled (Philip Craig ) * */ @@ -34,6 +45,8 @@ #include #include +#define IP_NAT_PPTP_VERSION "1.5" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP"); @@ -57,7 +70,7 @@ pptp_nat_expected(struct sk_buff **pskb, struct ip_nat_multi_range mr; struct ip_ct_pptp_master *ct_pptp_info; struct ip_nat_pptp *nat_pptp_info; - u_int32_t newsrcip, newdstip, newcid; + u_int32_t newip, newcid; int ret; IP_NF_ASSERT(info); @@ -72,7 +85,7 @@ pptp_nat_expected(struct sk_buff **pskb, /* need to alter GRE tuple because conntrack expectfn() used 'wrong' * (unmanipulated) values */ - if (hooknum == NF_IP_PRE_ROUTING) { + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) { DEBUGP("completing tuples with NAT info \n"); /* we can do this, since we're unconfirmed */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == @@ -80,68 +93,43 @@ pptp_nat_expected(struct sk_buff **pskb, /* assume PNS->PAC */ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); -// ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.gre.key = -// htonl(nat_pptp_info->pac_call_id); 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = htonl(nat_pptp_info->pns_call_id); + newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; + newcid = htonl(nat_pptp_info->pac_call_id); } else { /* assume PAC->PNS */ - DEBUGP("WRONG DIRECTION\n"); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.gre.key = htonl(nat_pptp_info->pac_call_id); - } - } - - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) - { - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == htonl(ct_pptp_info->pac_call_id)) - { - /* assume PNS->PAC */ - newdstip = master->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - newcid = htonl(nat_pptp_info->pac_call_id); - } - else - { - /* assume PAC->PNS */ - newdstip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; + newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; newcid = htonl(nat_pptp_info->pns_call_id); } - mr.rangesize = 1; - mr.range[0].flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED; - mr.range[0].min_ip = mr.range[0].max_ip = newdstip; - mr.range[0].min = mr.range[0].max = - ((union ip_conntrack_manip_proto ) { newcid }); - DEBUGP("change dest ip to %u.%u.%u.%u\n", - NIPQUAD(newdstip)); - DEBUGP("change dest key to 0x%x\n", ntohl(newcid)); - ret = ip_nat_setup_info(ct, &mr, hooknum); } else { - if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == htonl(ct_pptp_info->pac_call_id)) - { - newsrcip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.gre.key == + htonl(ct_pptp_info->pac_call_id)) { + /* assume PNS->PAC */ + newip = master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; newcid = htonl(ct_pptp_info->pns_call_id); } - else - { + else { /* assume PAC->PNS */ - newsrcip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + newip = master->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; newcid = htonl(ct_pptp_info->pac_call_id); } - - mr.rangesize = 1; - mr.range[0].flags 
= IP_NAT_RANGE_MAP_IPS - |IP_NAT_RANGE_PROTO_SPECIFIED; - mr.range[0].min_ip = mr.range[0].max_ip = newsrcip; - mr.range[0].min = mr.range[0].max = - ((union ip_conntrack_manip_proto ) { newcid }); - DEBUGP("change src ip to %u.%u.%u.%u\n", - NIPQUAD(newsrcip)); - DEBUGP("change 'src' key to 0x%x\n", ntohl(newcid)); - ret = ip_nat_setup_info(ct, &mr, hooknum); } + mr.rangesize = 1; + mr.range[0].flags = IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED; + mr.range[0].min_ip = mr.range[0].max_ip = newip; + mr.range[0].min = mr.range[0].max = + ((union ip_conntrack_manip_proto ) { newcid }); + DEBUGP("change ip to %u.%u.%u.%u\n", + NIPQUAD(newip)); + DEBUGP("change key to 0x%x\n", ntohl(newcid)); + ret = ip_nat_setup_info(ct, &mr, hooknum); + UNLOCK_BH(&ip_pptp_lock); return ret; @@ -150,13 +138,17 @@ pptp_nat_expected(struct sk_buff **pskb, /* outbound packets == from PNS to PAC */ static inline unsigned int -pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, - size_t datalen, +pptp_outbound_pkt(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *exp) { + struct iphdr *iph = (*pskb)->nh.iph; + struct tcphdr *tcph = (void *) iph + iph->ihl*4; + struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) + ((void *)tcph + tcph->doff*4); + struct PptpControlHeader *ctlh; union pptp_ctrl_union pptpReq; struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; @@ -164,6 +156,7 @@ pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, u_int16_t msg, *cid = NULL, new_callid; + /* FIXME: size checks !!! */ ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); pptpReq.rawreq = (void *) ((void *) ctlh + sizeof(*ctlh)); @@ -172,11 +165,18 @@ pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, switch (msg = ntohs(ctlh->messageType)) { case PPTP_OUT_CALL_REQUEST: cid = &pptpReq.ocreq->callID; + /* FIXME: ideally we would want to reserve a call ID + * here. 
current netfilter NAT core is not able to do + * this :( For now we use TCP source port. This breaks + * multiple calls within one control session */ /* save original call ID in nat_info */ nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id; - new_callid = tcph->source; + /* don't use tcph->source since we are at a DSTmanip + * hook (e.g. PREROUTING) and pkt is not mangled yet */ + new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; + /* save new call ID in ct info */ ct_pptp_info->pns_call_id = ntohs(new_callid); break; @@ -186,10 +186,6 @@ pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, case PPTP_CALL_CLEAR_REQUEST: cid = &pptpReq.clrreq->callID; break; - case PPTP_CALL_DISCONNECT_NOTIFY: - cid = &pptpReq.disc->callID; - break; - default: DEBUGP("unknown outbound packet 0x%04x:%s\n", msg, (msg <= PPTP_MSG_MAX)? strMName[msg]:strMName[0]); @@ -204,11 +200,6 @@ pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, case PPTP_ECHO_REQUEST: case PPTP_ECHO_REPLY: /* no need to alter packet */ - DEBUGP("outbound control message %s\n", strMName[msg]); - DEBUGP("ct->pac_call_id = %d\n", ct_pptp_info->pac_call_id); - DEBUGP("ct->pns_call_id = %d\n", ct_pptp_info->pns_call_id); - DEBUGP("nat->pac_call_id = %d\n", nat_pptp_info->pac_call_id); - DEBUGP("nat->pns_call_id = %d\n", nat_pptp_info->pns_call_id); return NF_ACCEPT; } @@ -216,27 +207,27 @@ pptp_outbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, DEBUGP("altering call id from 0x%04x to 0x%04x\n", ntohs(*cid), ntohs(new_callid)); + /* mangle packet */ - tcph->check = ip_nat_cheat_check(*cid^0xFFFF, - new_callid, tcph->check); - *cid = new_callid; - - DEBUGP("outbound control message %s\n", strMName[msg]); - DEBUGP("ct->pac_call_id = %d\n", ct_pptp_info->pac_call_id); - DEBUGP("ct->pns_call_id = %d\n", ct_pptp_info->pns_call_id); - DEBUGP("nat->pac_call_id = %d\n", nat_pptp_info->pac_call_id); - DEBUGP("nat->pns_call_id = %d\n", nat_pptp_info->pns_call_id); + 
ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)cid - (void *)pptph, + sizeof(new_callid), (char *)&new_callid, + sizeof(new_callid)); + return NF_ACCEPT; } /* inbound packets == from PAC to PNS */ static inline unsigned int -pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, - size_t datalen, +pptp_inbound_pkt(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *oldexp) { + struct iphdr *iph = (*pskb)->nh.iph; + struct tcphdr *tcph = (void *) iph + iph->ihl*4; + struct pptp_pkt_hdr *pptph = (struct pptp_pkt_hdr *) + ((void *)tcph + tcph->doff*4); + struct PptpControlHeader *ctlh; union pptp_ctrl_union pptpReq; struct ip_ct_pptp_master *ct_pptp_info = &ct->help.ct_pptp_info; @@ -245,8 +236,10 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, u_int16_t msg, new_cid = 0, new_pcid, *pcid = NULL, *cid = NULL; u_int32_t old_dst_ip; - struct ip_conntrack_tuple t; + struct ip_conntrack_tuple t, inv_t; + struct ip_conntrack_tuple *orig_t, *reply_t; + /* FIXME: size checks !!! 
*/ ctlh = (struct PptpControlHeader *) ((void *) pptph + sizeof(*pptph)); pptpReq.rawreq = (void *) ((void *) ctlh + sizeof(*ctlh)); @@ -262,23 +255,30 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, } old_dst_ip = oldexp->tuple.dst.ip; t = oldexp->tuple; + invert_tuplepr(&inv_t, &t); /* save original PAC call ID in nat_info */ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; - /* store new callID in ct_info, so conntrack works */ - //ct_pptp_info->pac_call_id = ntohs(tcph->source); - //new_cid = htons(ct_pptp_info->pac_call_id); - /* alter expectation */ - if (t.dst.ip == ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip) { + orig_t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + if (t.src.ip == orig_t->src.ip && t.dst.ip == orig_t->dst.ip) { /* expectation for PNS->PAC direction */ - t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); + t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); + inv_t.src.ip = reply_t->src.ip; + inv_t.dst.ip = reply_t->dst.ip; + inv_t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); + inv_t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); } else { /* expectation for PAC->PNS direction */ - t.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - DEBUGP("EXPECTATION IN WRONG DIRECTION!!!\n"); + t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); + t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); + inv_t.src.ip = orig_t->src.ip; + inv_t.dst.ip = orig_t->dst.ip; + inv_t.src.u.gre.key = htonl(nat_pptp_info->pns_call_id); + inv_t.dst.u.gre.key = htonl(ct_pptp_info->pac_call_id); } if (!ip_conntrack_change_expect(oldexp, &t)) { @@ -287,13 +287,7 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, DEBUGP("can't change expect\n"); } ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_orig, &t); - /* reply keymap */ - t.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip; - t.dst.ip = 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip; - t.src.u.gre.key = htonl(nat_pptp_info->pac_call_id); - t.dst.u.gre.key = htonl(ct_pptp_info->pns_call_id); - ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_reply, &t); - + ip_ct_gre_keymap_change(oldexp->proto.gre.keymap_reply, &inv_t); break; case PPTP_IN_CALL_CONNECT: pcid = &pptpReq.iccon->peersCallID; @@ -323,9 +317,10 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, case PPTP_WAN_ERROR_NOTIFY: pcid = &pptpReq.wanerr->peersCallID; break; - case PPTP_SET_LINK_INFO: - pcid = &pptpReq.setlink->peersCallID; + case PPTP_CALL_DISCONNECT_NOTIFY: + pcid = &pptpReq.disc->callID; break; + default: DEBUGP("unknown inbound packet %s\n", (msg <= PPTP_MSG_MAX)? strMName[msg]:strMName[0]); @@ -334,14 +329,10 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, case PPTP_START_SESSION_REQUEST: case PPTP_START_SESSION_REPLY: case PPTP_STOP_SESSION_REQUEST: + case PPTP_STOP_SESSION_REPLY: case PPTP_ECHO_REQUEST: case PPTP_ECHO_REPLY: /* no need to alter packet */ - DEBUGP("inbound control message %s\n", strMName[msg]); - DEBUGP("ct->pac_call_id = %d\n", ct_pptp_info->pac_call_id); - DEBUGP("ct->pns_call_id = %d\n", ct_pptp_info->pns_call_id); - DEBUGP("nat->pac_call_id = %d\n", nat_pptp_info->pac_call_id); - DEBUGP("nat->pns_call_id = %d\n", nat_pptp_info->pns_call_id); return NF_ACCEPT; } @@ -349,25 +340,21 @@ pptp_inbound_pkt(struct tcphdr *tcph, struct pptp_pkt_hdr *pptph, IP_NF_ASSERT(pcid); DEBUGP("altering peer call id from 0x%04x to 0x%04x\n", ntohs(*pcid), ntohs(new_pcid)); - tcph->check = ip_nat_cheat_check(*pcid^0xFFFF, - new_pcid, tcph->check); - *pcid = new_pcid; + ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, (void *)pcid - (void *)pptph, + sizeof(new_pcid), (char *)&new_pcid, + sizeof(new_pcid)); if (new_cid) { IP_NF_ASSERT(cid); DEBUGP("altering call id from 0x%04x to 0x%04x\n", ntohs(*cid), ntohs(new_cid)); - tcph->check = ip_nat_cheat_check(*cid^0xFFFF, - new_cid, tcph->check); - 
*cid = new_cid; + ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, + (void *)cid - (void *)pptph, + sizeof(new_cid), (char *)&new_cid, + sizeof(new_cid)); } /* great, at least we don't need to resize packets */ - DEBUGP("inbound control message %s\n", strMName[msg]); - DEBUGP("ct->pac_call_id = %d\n", ct_pptp_info->pac_call_id); - DEBUGP("ct->pns_call_id = %d\n", ct_pptp_info->pns_call_id); - DEBUGP("nat->pac_call_id = %d\n", nat_pptp_info->pac_call_id); - DEBUGP("nat->pns_call_id = %d\n", nat_pptp_info->pns_call_id); return NF_ACCEPT; } @@ -387,12 +374,13 @@ static unsigned int tcp_help(struct ip_conntrack *ct, DEBUGP("entering\n"); - /* Only mangle things once: original direction in POST_ROUTING - and reply direction on PRE_ROUTING. */ + /* Only mangle things once: DST for original direction + and SRC for reply direction. */ dir = CTINFO2DIR(ctinfo); - if (!((HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC && dir == IP_CT_DIR_ORIGINAL) - || (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST && dir == IP_CT_DIR_REPLY))) - { + if (!((HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + && dir == IP_CT_DIR_ORIGINAL) + || (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST + && dir == IP_CT_DIR_REPLY))) { DEBUGP("Not touching dir %s at hook %s\n", dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY", hooknum == NF_IP_POST_ROUTING ? 
"POSTROUTING" @@ -409,13 +397,11 @@ static unsigned int tcp_help(struct ip_conntrack *ct, return NF_ACCEPT; } - pptph = (struct pptp_pkt_hdr *) ((void *)tcph + tcph->doff*4); /* if it's not a control message, we can't handle it */ if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || - ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) - { + ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { DEBUGP("not a pptp control packet\n"); return NF_ACCEPT; } @@ -424,10 +410,10 @@ static unsigned int tcp_help(struct ip_conntrack *ct, if (dir == IP_CT_DIR_ORIGINAL) { /* reuqests sent by client to server (PNS->PAC) */ - pptp_outbound_pkt(tcph, pptph, datalen, ct, ctinfo, exp); + pptp_outbound_pkt(pskb, ct, ctinfo, exp); } else { /* response from the server to the client (PAC->PNS) */ - pptp_inbound_pkt(tcph, pptph, datalen, ct, ctinfo, exp); + pptp_inbound_pkt(pskb, ct, ctinfo, exp); } UNLOCK_BH(&ip_pptp_lock); @@ -437,29 +423,52 @@ static unsigned int tcp_help(struct ip_conntrack *ct, /* nat helper struct for control connection */ static struct ip_nat_helper pptp_tcp_helper = { - { NULL, NULL }, - "pptp", IP_NAT_HELPER_F_ALWAYS, THIS_MODULE, - { { 0, { tcp: { port: __constant_htons(PPTP_CONTROL_PORT) } } }, - { 0, { 0 }, IPPROTO_TCP } }, - { { 0, { tcp: { port: 0xFFFF } } }, - { 0, { 0 }, 0xFFFF } }, - tcp_help, pptp_nat_expected }; + .list = { NULL, NULL }, + .name = "pptp", + .flags = IP_NAT_HELPER_F_ALWAYS, + .me = THIS_MODULE, + .tuple = { .src = { .ip = 0, + .u = { .tcp = { .port = + __constant_htons(PPTP_CONTROL_PORT) } + } + }, + .dst = { .ip = 0, + .u = { .all = 0 }, + .protonum = IPPROTO_TCP + } + }, + + .mask = { .src = { .ip = 0, + .u = { .tcp = { .port = 0xFFFF } } + }, + .dst = { .ip = 0, + .u = { .all = 0 }, + .protonum = 0xFFFF + } + }, + .help = tcp_help, + .expect = pptp_nat_expected +}; static int __init init(void) { - DEBUGP("init_module\n" ); - - if (ip_nat_helper_register(&pptp_tcp_helper)) + DEBUGP("%s: registering NAT helper\n", __FILE__); + if 
(ip_nat_helper_register(&pptp_tcp_helper)) { + printk(KERN_ERR "Unable to register NAT application helper " + "for pptp\n"); return -EIO; + } - return 0; + printk("ip_nat_pptp version %s loaded\n", IP_NAT_PPTP_VERSION); + return 0; } static void __exit fini(void) { DEBUGP("cleanup_module\n" ); - ip_nat_helper_unregister(&pptp_tcp_helper); + ip_nat_helper_unregister(&pptp_tcp_helper); + printk("ip_nat_pptp version %s unloaded\n", IP_NAT_PPTP_VERSION); } module_init(init); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_esp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_esp.c old mode 100755 new mode 100644 diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_gre.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_gre.c index 25aa1786..9be95857 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -1,5 +1,5 @@ /* - * ip_nat_proto_gre.c - Version 1.11 + * ip_nat_proto_gre.c - Version 1.2 * * NAT protocol helper module for GRE. * @@ -17,7 +17,7 @@ * * Documentation about PPTP can be found in RFC 2637 * - * (C) 2000-2002 by Harald Welte + * (C) 2000-2003 by Harald Welte * * Development of this code funded by Astaro AG (http://www.astaro.com/) * @@ -35,7 +35,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); +#if 0 +#define DEBUGP(format, args...) printk(KERN_DEBUG __FILE__ ":" __FUNCTION__ \ + ": " format, ## args) +#else #define DEBUGP(x, args...) 
+#endif /* is key in given range between min and max */ static int @@ -44,8 +49,15 @@ gre_in_range(const struct ip_conntrack_tuple *tuple, const union ip_conntrack_manip_proto *min, const union ip_conntrack_manip_proto *max) { - return ntohl(tuple->src.u.gre.key) >= ntohl(min->gre.key) - && ntohl(tuple->src.u.gre.key) <= ntohl(max->gre.key); + u_int32_t key; + + if (maniptype == IP_NAT_MANIP_SRC) + key = tuple->src.u.gre.key; + else + key = tuple->dst.u.gre.key; + + return ntohl(key) >= ntohl(min->gre.key) + && ntohl(key) <= ntohl(max->gre.key); } /* generate unique tuple ... */ @@ -122,6 +134,7 @@ gre_manip_pkt(struct iphdr *iph, size_t len, break; } if (greh->csum) { + /* FIXME: Never tested this code... */ *(gre_csum(greh)) = ip_nat_cheat_check(~*(gre_key(greh)), manip->u.gre.key, diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_udp.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_udp.c index 05aefcd4..622aee05 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -40,9 +40,6 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple, else portptr = &tuple->dst.u.udp.port; - if(ntohs(*portptr) == 500) - return 0;//must not be "return 1" - /* If no range specified... 
*/ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_tables.c b/release/src/linux/linux/net/ipv4/netfilter/ip_tables.c index 99438ca0..2e3004db 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ip_tables.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ip_tables.c @@ -332,7 +332,7 @@ ipt_do_table(struct sk_buff **pskb, continue; } if (table_base + v - != (void *)e + e->next_offset) { + != (void *)e + e->next_offset && !(e->ip.flags & IPT_F_GOTO)) { /* Save old back ptr in next entry */ struct ipt_entry *next = (void *)e + e->next_offset; @@ -374,6 +374,12 @@ ipt_do_table(struct sk_buff **pskb, if (verdict == IPT_CONTINUE) e = (void *)e + e->next_offset; + else if (verdict == IPT_RETURN) { // added -- zzz + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } else /* Verdict */ break; @@ -1169,13 +1175,11 @@ do_add_counters(void *user, unsigned int len) goto free; write_lock_bh(&t->lock); - /************************************* - * modify by tanghui @ 2006-10-11 - * for a RACE CONDITION in the "do_add_counters()" function - *************************************/ - //if (t->private->number != paddc->num_counters) { - if (t->private->number != tmp.num_counters) { - /*************************************/ + +#if 0 // removed 1.11 forward bug test + // if (t->private->number != tmp.num_counters) { // 43011: modify by tanghui @ 2006-10-11 for a RACE CONDITION in the "do_add_counters()" function +#endif + if (t->private->number != paddc->num_counters) { ret = -EINVAL; goto unlock_up_free; } @@ -1676,7 +1680,7 @@ static struct ipt_match icmp_matchstruct = { { NULL, NULL }, "icmp", &icmp_match, &icmp_checkentry, NULL }; #ifdef CONFIG_PROC_FS -static inline int print_name(const char *i, +static int print_name(const char *i, off_t start_offset, char *buffer, int length, off_t *pos, unsigned int *count) { @@ -1694,6 +1698,15 @@ static 
inline int print_name(const char *i, return 0; } +static inline int print_target(const struct ipt_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ipt_standard_target || t == &ipt_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); +} + static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) { off_t pos = 0; @@ -1720,7 +1733,7 @@ static int ipt_get_targets(char *buffer, char **start, off_t offset, int length) if (down_interruptible(&ipt_mutex) != 0) return 0; - LIST_FIND(&ipt_target, print_name, void *, + LIST_FIND(&ipt_target, print_target, struct ipt_target *, offset, buffer, length, &pos, &count); up(&ipt_mutex); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_BCOUNT.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_BCOUNT.c new file mode 100644 index 00000000..b40e7e2e --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_BCOUNT.c @@ -0,0 +1,63 @@ +/* + + BCOUNT target + Copyright (C) 2006 Jonathan Zarate + + Licensed under GNU GPL v2 or later. 
+
+*/
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+// #define DEBUG_BCOUNT
+
+/*
+ * BCOUNT target hook.
+ *
+ * Adds the length of the packet to the bcount field of the conntrack entry
+ * that ip_conntrack_get() returns for it, clamping the counter at
+ * 0x0FFFFFFF.  Packets without a conntrack entry are left alone.
+ *
+ * Always returns IPT_CONTINUE so rule traversal carries on.
+ */
+static unsigned int target(struct sk_buff **pskb, unsigned int hooknum,
+	const struct net_device *in, const struct net_device *out,
+	const void *targinfo, void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	if (ct) {
+		ct->bcount += (*pskb)->len;
+		/* keep the counter pinned at its ceiling once reached */
+		if (ct->bcount >= 0x0FFFFFFF) ct->bcount = 0x0FFFFFFF;
+#ifdef DEBUG_BCOUNT
+		/* kernel code must log through printk(); the casts make both
+		 * arguments match the %lx conversions */
+		if (net_ratelimit())
+			printk(KERN_DEBUG "BCOUNT %lx %lx\n",
+			       (unsigned long)(*pskb)->len,
+			       (unsigned long)ct->bcount);
+#endif
+	}
+	return IPT_CONTINUE;
+}
+
+/*
+ * Rule validation: accepts the rule only when the target data has the
+ * aligned size of struct ipt_BCOUNT_target.  Returns 1 if valid, 0 if not.
+ */
+static int checkentry(const char *tablename, const struct ipt_entry *e, void *targinfo,
+	unsigned int targinfosize, unsigned int hook_mask)
+{
+	return (targinfosize == IPT_ALIGN(sizeof(struct ipt_BCOUNT_target)));
+}
+
+static struct ipt_target BCOUNT_target
+= { { NULL, NULL }, "BCOUNT", target, checkentry, NULL, THIS_MODULE };
+
+/* Module load: register the BCOUNT target and pass its result through. */
+static int __init init(void)
+{
+	return ipt_register_target(&BCOUNT_target);
+}
+
+/* Module unload: unregister the BCOUNT target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&BCOUNT_target);
+}
+
+module_init(init);
+module_exit(fini);
+
+
+MODULE_AUTHOR("Jonathan Zarate");
+MODULE_DESCRIPTION("BCOUNT target");
+MODULE_LICENSE("GPL");
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_CLASSIFY.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_CLASSIFY.c
new file mode 100644
index 00000000..f7320721
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -0,0 +1,82 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+MODULE_AUTHOR("Patrick McHardy ");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables qdisc classification target module");
+
+/*
+ * CLASSIFY target hook: copies the configured priority into skb->priority
+ * when it differs, and flags the packet as altered in that case.
+ * Always returns IPT_CONTINUE.
+ */
+static unsigned int
+target(struct sk_buff **pskb,
+       unsigned int hooknum,
+       const struct net_device *in,
+       const struct net_device *out,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_classify_target_info *clinfo = targinfo;
+
+	if((*pskb)->priority != clinfo->priority) {
+		(*pskb)->priority = clinfo->priority;
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+
+	return IPT_CONTINUE;
+}
+
+/*
+ * Rule validation.  The rule is accepted (return 1) only when the target
+ * data has the expected aligned size, the rule is reachable solely from
+ * POST_ROUTING, and it lives in the "mangle" table; otherwise a message is
+ * logged and 0 is returned.
+ */
+static int
+checkentry(const char *tablename,
+           const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
+		/* the expected size is a size_t, hence %Zu (as the other
+		 * targets in this tree print it) */
+		printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
+		return 0;
+	}
+
+	if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+		printk(KERN_ERR "CLASSIFY: only valid in POST_ROUTING.\n");
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "CLASSIFY: can only be called from "
+		                    "\"mangle\" table, not \"%s\".\n",
+		                    tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_classify_reg
+= { { NULL, NULL }, "CLASSIFY", target, checkentry, NULL, THIS_MODULE };
+
+/*
+ * Module load: register the target.  The registration result is returned
+ * as-is so the caller sees the real error code instead of a blanket
+ * -EINVAL.
+ */
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_classify_reg);
+}
+
+/* Module unload: unregister the target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_classify_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_CONNMARK.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_CONNMARK.c
new file mode 100644
index 00000000..a6038378
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -0,0 +1,128 @@
+/* This kernel module is used to modify the connection mark values, or
+ *
to optionally restore the skb nfmark from the connection mark
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB
+ * by Henrik Nordstrom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include
+#include
+#include
+#include
+
+MODULE_AUTHOR("Henrik Nordstrom ");
+MODULE_DESCRIPTION("IP tables CONNMARK matching module");
+MODULE_LICENSE("GPL");
+
+#include
+#include
+#include
+
+/*
+ * CONNMARK target hook.  Acts only when the packet has a conntrack entry;
+ * the action depends on markinfo->mode:
+ *
+ *   IPT_CONNMARK_SET        - replace the masked bits of ct->mark with
+ *                             markinfo->mark
+ *   IPT_CONNMARK_SET_RETURN - set ct->mark to markinfo->mark, set the
+ *                             packet nfmark to (mark & mask) and return
+ *                             IPT_RETURN
+ *   IPT_CONNMARK_SAVE       - copy the masked nfmark bits into ct->mark
+ *   IPT_CONNMARK_RESTORE    - copy the masked ct->mark bits into nfmark
+ *
+ * Every mode except SET_RETURN ends with IPT_CONTINUE.  NFC_ALTERED is set
+ * whenever the packet's nfmark is actually changed.
+ */
+static unsigned int
+target(struct sk_buff **pskb,
+       unsigned int hooknum,
+       const struct net_device *in,
+       const struct net_device *out,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_connmark_target_info *markinfo = targinfo;
+	unsigned long diff;
+	unsigned long nfmark;
+	unsigned long newmark;
+
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+	if (ct) {
+		switch(markinfo->mode) {
+		case IPT_CONNMARK_SET:
+			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+			if (newmark != ct->mark)
+				ct->mark = newmark;
+			break;
+		case IPT_CONNMARK_SET_RETURN:
+			// Set connmark and nfmark, apply mask to nfmark, do IPT_RETURN - zzz
+			newmark = ct->mark = markinfo->mark;
+			newmark &= markinfo->mask;
+			nfmark = (*pskb)->nfmark;
+			if (newmark != nfmark) {
+				(*pskb)->nfmark = newmark;
+				(*pskb)->nfcache |= NFC_ALTERED;
+			}
+			return IPT_RETURN;
+		case IPT_CONNMARK_SAVE:
+			newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+			if (ct->mark != newmark)
+				ct->mark = newmark;
+			break;
+		case IPT_CONNMARK_RESTORE:
+			nfmark = (*pskb)->nfmark;
+			/* bits (within the mask) where nfmark and ct->mark differ */
+			diff = (ct->mark ^ nfmark) & markinfo->mask;	// zzz
+			if (diff != 0) {
+				(*pskb)->nfmark = nfmark ^ diff;
+				(*pskb)->nfcache |= NFC_ALTERED;
+			}
+			break;
+		default:
+			/* unknown mode: leave packet and connection untouched */
+			break;
+		}
+	}
+
+	return IPT_CONTINUE;
+}
+
+/*
+ * Rule validation: the target data must have the expected aligned size and
+ * the RESTORE mode is only allowed in the "mangle" table.
+ * Returns 1 when the rule is acceptable, 0 otherwise.
+ */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+	   void *targinfo,
+	   unsigned int targinfosize,
+	   unsigned int hook_mask)
+{
+	struct ipt_connmark_target_info *matchinfo = targinfo;
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
+		printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
+		return 0;
+	}
+
+	if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
+		if (strcmp(tablename, "mangle") != 0) {
+			printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_connmark_reg = {
+	.name = "CONNMARK",
+	.target = &target,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+/* Module load: register the CONNMARK target. */
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_connmark_reg);
+}
+
+/* Module unload: unregister the CONNMARK target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_connmark_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_IMQ.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_IMQ.c
new file mode 100644
index 00000000..2ba068b3
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_IMQ.c
@@ -0,0 +1,78 @@
+/* This target marks packets to be enqueued to an imq device */
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * IMQ target hook: stores the configured device number together with the
+ * IMQ_F_ENQUEUE flag in skb->imq_flags and marks the packet as altered.
+ * Always returns IPT_CONTINUE.
+ */
+static unsigned int imq_target(struct sk_buff **pskb,
+			       unsigned int hooknum,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       const void *targinfo,
+			       void *userinfo)
+{
+	struct ipt_imq_info *mr = (struct ipt_imq_info*)targinfo;
+
+	(*pskb)->imq_flags = mr->todev | IMQ_F_ENQUEUE;
+	(*pskb)->nfcache |= NFC_ALTERED;
+
+	return IPT_CONTINUE;
+}
+
+/*
+ * Rule validation: checks the size of the target data, that the rule is in
+ * the "mangle" table, and that the requested device number is in range.
+ * Returns 1 when the rule is acceptable, 0 otherwise.
+ */
+static int imq_checkentry(const char *tablename,
+			  const struct ipt_entry *e,
+			  void *targinfo,
+			  unsigned int targinfosize,
+			  unsigned int hook_mask)
+{
+	struct ipt_imq_info *mr;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_imq_info))) {
+		printk(KERN_WARNING "IMQ: invalid targinfosize\n");
+		return 0;
+	}
+	mr = (struct ipt_imq_info*)targinfo;
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING
+		       "IMQ: IMQ can only be called from \"mangle\" table, not \"%s\"\n",
+		       tablename);
+		return 0;
+	}
+
+	/* NOTE(review): assumes IMQ_MAX_DEVS is the number of imq devices,
+	 * numbered from 0, so the highest valid index is IMQ_MAX_DEVS - 1;
+	 * confirm against the imq header. */
+	if (mr->todev > IMQ_MAX_DEVS - 1) {
+		printk(KERN_WARNING
+		       "IMQ: invalid device specified, highest is %u\n",
+		       IMQ_MAX_DEVS - 1);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_imq_reg = {
+	{ NULL, NULL},
+	"IMQ",
+	imq_target,
+	imq_checkentry,
+	NULL,
+	THIS_MODULE
+};
+
+/* Module load: register the IMQ target; any failure is reported as -EINVAL. */
+static int __init init(void)
+{
+	if (ipt_register_target(&ipt_imq_reg))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Module unload: unregister the IMQ target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_imq_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_MACSAVE.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_MACSAVE.c
new file mode 100644
index 00000000..62677279
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_MACSAVE.c
@@ -0,0 +1,65 @@
+/*
+
+	MACSAVE target
+	Copyright (C) 2006 Jonathan Zarate
+
+	Licensed under GNU GPL v2 or later.
+
+*/
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/*
+ * MACSAVE target hook: when a complete Ethernet header lies between
+ * skb->head and skb->data, the source MAC address is copied into the
+ * macsave field of the packet's conntrack entry (if it has one).
+ * Always returns IPT_CONTINUE.
+ */
+static unsigned int target(struct sk_buff **pskb, unsigned int hooknum,
+	const struct net_device *in, const struct net_device *out,
+	const void *targinfo, void *userinfo)
+{
+	struct sk_buff *skb = *pskb;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct;
+
+	/* no usable link-layer header in the buffer: nothing to save */
+	if (skb->mac.raw < skb->head || (skb->mac.raw + ETH_HLEN) > skb->data)
+		return IPT_CONTINUE;
+
+	ct = ip_conntrack_get(skb, &ctinfo);
+	if (ct)
+		memcpy(ct->macsave, skb->mac.ethernet->h_source, sizeof(ct->macsave));
+
+	return IPT_CONTINUE;
+}
+
+/*
+ * Rule validation: the target data must have the expected aligned size and
+ * the rule may only be reachable from PREROUTING, FORWARD or INPUT.
+ * Returns 1 when the rule is acceptable, 0 otherwise.
+ */
+static int checkentry(const char *tablename, const struct ipt_entry *e, void *targinfo,
+	unsigned int targinfosize, unsigned int hook_mask)
+{
+	const unsigned int allowed_hooks = (1 << NF_IP_PRE_ROUTING) |
+					   (1 << NF_IP_FORWARD) |
+					   (1 << NF_IP_LOCAL_IN);
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_MACSAVE_target_info))) {
+		printk(KERN_ERR "MACSAVE: Invalid data size\n");
+		return 0;
+	}
+
+	if ((hook_mask & ~allowed_hooks) != 0) {
+		printk(KERN_ERR "MACSAVE: Valid only in PREROUTING, FORWARD and INPUT\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target macsave_target
+= { { NULL, NULL }, "MACSAVE", target, checkentry, NULL, THIS_MODULE };
+
+/* Module load: register the MACSAVE target and hand back the result. */
+static int __init init(void)
+{
+	return ipt_register_target(&macsave_target);
+}
+
+/* Module unload: unregister the MACSAVE target. */
+static void __exit fini(void)
+{
+	ipt_unregister_target(&macsave_target);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_ROUTE.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_ROUTE.c
new file mode 100644
index 00000000..b97d7792
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_ROUTE.c
@@ -0,0 +1,422 @@
+/*
+ * This implements the ROUTE target, which enables you to setup unusual
+ * routes not supported by the standard kernel routing table.
+ *
+ * Copyright (C) 2002 Cedric de Launois
+ *
+ * v 1.11 2004/11/23
+ *
+ * This software is distributed under GNU GPL v2, 1991
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+
+/* Try to route the packet according to the routing keys specified in
+ * route_info. Keys are :
+ *  - ifindex :
+ *      0 if no oif preferred,
+ *      otherwise set to the index of the desired oif
+ *  - route_info->gw :
+ *      0 if no gateway specified,
+ *      otherwise set to the next host to which the pkt must be routed
+ * If success, skb->dev is the output device to which the packet must
+ * be sent and skb->dst is not NULL
+ *
+ * The previous skb->dst is released as soon as the lookup succeeds, so on
+ * a 0 return skb->dst is NULL.
+ *
+ * RETURN: -1 if an error occurred
+ *          1 if the packet was successfully routed to the
+ *            destination desired
+ *          0 if the kernel routing table could not route the packet
+ *            according to the keys specified
+ */
+static int route(struct sk_buff *skb,
+		 unsigned int ifindex,
+		 const struct ipt_route_target_info *route_info)
+{
+	int err;
+	struct rtable *rt;
+	struct iphdr *iph = skb->nh.iph;
+	struct rt_key key = {
+		dst:iph->daddr,
+		src:0,
+		oif:ifindex,
+		tos:RT_TOS(iph->tos)
+	};
+
+	/* The destination address may be overloaded by the target */
+	if (route_info->gw)
+		key.dst = route_info->gw;
+
+	/* Trying to route the packet using the standard routing table. */
+	if ((err = ip_route_output_key(&rt, &key))) {
+		if (net_ratelimit())
+			DEBUGP("ipt_ROUTE: couldn't route pkt (err: %i)",err);
+		return -1;
+	}
+
+	/* Drop old route. */
+	dst_release(skb->dst);
+	skb->dst = NULL;
+
+	/* Success if no oif specified or if the oif correspond to the
+	 * one desired */
+	if (!ifindex || rt->u.dst.dev->ifindex == ifindex) {
+		/* the reference taken by the lookup now belongs to the skb */
+		skb->dst = &rt->u.dst;
+		skb->dev = skb->dst->dev;
+		return 1;
+	}
+
+	/* The interface selected by the routing table is not the one
+	 * specified by the user. This may happen because the dst address
+	 * is one of our own addresses.
+	 */
+	if (net_ratelimit())
+		DEBUGP("ipt_ROUTE: failed to route as desired gw=%u.%u.%u.%u oif=%i (got oif=%i)\n",
+		       NIPQUAD(route_info->gw), ifindex, rt->u.dst.dev->ifindex);
+
+	/* The route is not attached to the skb, so give back the reference
+	 * the lookup took; otherwise it would never be released. */
+	dst_release(&rt->u.dst);
+
+	return 0;
+}
+
+
+/* Stolen from ip_finish_output2
+ * PRE : skb->dev is set to the device we are leaving by
+ *       skb->dst is not NULL
+ * POST: the packet is sent with the link layer header pushed
+ *       the packet is destroyed
+ */
+static void ip_direct_send(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct hh_cache *hh = dst->hh;
+
+	if (hh) {
+		/* copy the cached hardware header in front of the payload */
+		read_lock_bh(&hh->hh_lock);
+		memcpy(skb->data - 16, hh->hh_data, 16);
+		read_unlock_bh(&hh->hh_lock);
+		skb_push(skb, hh->hh_len);
+		hh->hh_output(skb);
+	} else if (dst->neighbour)
+		dst->neighbour->output(skb);
+	else {
+		if (net_ratelimit())
+			DEBUGP(KERN_DEBUG "ipt_ROUTE: no hdr & no neighbour cache!\n");
+		kfree_skb(skb);
+	}
+}
+
+
+/* PRE : skb->dev is set to the device we are leaving by
+ * POST: - the packet is directly sent to the skb->dev device, without
+ *         pushing the link layer header.
+ *       - the packet is destroyed
+ */
+static inline int dev_direct_send(struct sk_buff *skb)
+{
+	return dev_queue_xmit(skb);
+}
+
+
+/* Route the packet out of the interface named in route_info->oif.
+ *
+ * RETURN: NF_STOLEN    if the packet was sent (and thereby consumed)
+ *         IPT_CONTINUE if it was routed and IPT_ROUTE_CONTINUE is set
+ *         NF_DROP      on any failure; the skb is NOT freed in that case
+ *
+ * Every return path after a successful dev_get_by_name() releases the
+ * device reference with dev_put().
+ */
+static unsigned int route_oif(const struct ipt_route_target_info *route_info,
+			      struct sk_buff *skb)
+{
+	unsigned int ifindex = 0;
+	struct net_device *dev_out = NULL;
+
+	/* The user set the interface name to use.
+	 * Getting the current interface index.
+	 */
+	if ((dev_out = dev_get_by_name(route_info->oif))) {
+		ifindex = dev_out->ifindex;
+	} else {
+		/* Unknown interface name : packet dropped */
+		if (net_ratelimit())
+			DEBUGP("ipt_ROUTE: oif interface %s not found\n", route_info->oif);
+		return NF_DROP;
+	}
+
+	/* Trying the standard way of routing packets */
+	switch (route(skb, ifindex, route_info)) {
+	case 1:
+		dev_put(dev_out);
+		if (route_info->flags & IPT_ROUTE_CONTINUE)
+			return IPT_CONTINUE;
+
+		ip_direct_send(skb);
+		return NF_STOLEN;
+
+	case 0:
+		/* Failed to send to oif. Trying the hard way */
+		if (route_info->flags & IPT_ROUTE_CONTINUE) {
+			/* do not leak the reference taken above */
+			dev_put(dev_out);
+			return NF_DROP;
+		}
+
+		if (net_ratelimit())
+			DEBUGP("ipt_ROUTE: forcing the use of %i\n",
+			       ifindex);
+
+		/* We have to force the use of an interface.
+		 * This interface must be a tunnel interface since
+		 * otherwise we can't guess the hw address for
+		 * the packet. For a tunnel interface, no hw address
+		 * is needed.
+		 */
+		if ((dev_out->type != ARPHRD_TUNNEL)
+		    && (dev_out->type != ARPHRD_IPGRE)) {
+			if (net_ratelimit())
+				DEBUGP("ipt_ROUTE: can't guess the hw addr !\n");
+			dev_put(dev_out);
+			return NF_DROP;
+		}
+
+		/* Send the packet. This will also free skb
+		 * Do not go through the POST_ROUTING hook because
+		 * skb->dst is not set and because it will probably
+		 * get confused by the destination IP address.
+		 */
+		skb->dev = dev_out;
+		dev_direct_send(skb);
+		dev_put(dev_out);
+		return NF_STOLEN;
+
+	default:
+		/* Unexpected error */
+		dev_put(dev_out);
+		return NF_DROP;
+	}
+}
+
+
+/* Re-inject the packet as if it had been received on the interface named
+ * in route_info->iif.
+ *
+ * RETURN: NF_STOLEN if the packet was handed to netif_rx()
+ *         NF_DROP   if the interface does not exist (skb is not freed)
+ */
+static unsigned int route_iif(const struct ipt_route_target_info *route_info,
+			      struct sk_buff *skb)
+{
+	struct net_device *dev_in = NULL;
+
+	/* Getting the current interface index. */
+	if (!(dev_in = dev_get_by_name(route_info->iif))) {
+		if (net_ratelimit())
+			DEBUGP("ipt_ROUTE: iif interface %s not found\n", route_info->iif);
+		return NF_DROP;
+	}
+
+	skb->dev = dev_in;
+	dst_release(skb->dst);
+	skb->dst = NULL;
+
+	netif_rx(skb);
+	dev_put(dev_in);
+	return NF_STOLEN;
+}
+
+
+/* Route the packet through the gateway in route_info->gw, letting the
+ * routing table pick the interface.
+ *
+ * RETURN: NF_STOLEN    if the packet was sent
+ *         IPT_CONTINUE if it was routed and IPT_ROUTE_CONTINUE is set
+ *         NF_DROP      if no route was found (skb is not freed)
+ */
+static unsigned int route_gw(const struct ipt_route_target_info *route_info,
+			     struct sk_buff *skb)
+{
+	if (route(skb, 0, route_info)!=1)
+		return NF_DROP;
+
+	if (route_info->flags & IPT_ROUTE_CONTINUE)
+		return IPT_CONTINUE;
+
+	ip_direct_send(skb);
+	return NF_STOLEN;
+}
+
+/* To detect and deter routed packet loopback when using the --tee option,
+ * we take a page out of the raw.patch book: on the copied skb, we set up
+ * a fake ->nfct entry, pointing to the local &route_tee_track. We skip
+ * routing packets when we see they already have that ->nfct.
+ */
+
+static struct ip_conntrack route_tee_track;
+
+/* ROUTE target hook.
+ *
+ * Handles the TTL at PREROUTING/INPUT (where the IP stack has not done so
+ * yet), optionally duplicates the packet (--tee), and then dispatches to
+ * route_oif(), route_iif() or route_gw() depending on which parameter the
+ * rule carries.
+ *
+ * RETURN: the verdict of the routing helper, or IPT_CONTINUE when --tee is
+ *         set (the original packet always continues in that case) or when
+ *         the rule carries no parameter.
+ */
+static unsigned int ipt_route_target(struct sk_buff **pskb,
+				     unsigned int hooknum,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     const void *targinfo,
+				     void *userinfo)
+{
+	const struct ipt_route_target_info *route_info = targinfo;
+	struct sk_buff *skb = *pskb;
+	unsigned int res;
+
+	/* If we are at PREROUTING or INPUT hook
+	 * the TTL isn't decreased by the IP stack
+	 */
+	if (hooknum == NF_IP_PRE_ROUTING ||
+	    hooknum == NF_IP_LOCAL_IN) {
+
+		struct iphdr *iph = skb->nh.iph;
+
+		if (iph->ttl <= 1) {
+			struct rtable *rt;
+
+			if (ip_route_output(&rt, iph->saddr, iph->daddr,
+					    RT_TOS(iph->tos) | RTO_CONN,
+					    0)) {
+				return NF_DROP;
+			}
+
+			if (skb->dev == rt->u.dst.dev) {
+				/* Drop old route. */
+				dst_release(skb->dst);
+				skb->dst = &rt->u.dst;
+
+				/* this will traverse normal stack, and
+				 * thus call conntrack on the icmp packet */
+				icmp_send(skb, ICMP_TIME_EXCEEDED,
+					  ICMP_EXC_TTL, 0);
+			} else {
+				/* route not attached to the skb: release the
+				 * reference the lookup took */
+				dst_release(&rt->u.dst);
+			}
+
+			return NF_DROP;
+		}
+
+		/*
+		 * If we are at INPUT the checksum must be recalculated since
+		 * the length could change as the result of a defragmentation.
+		 * -- Rickard Molin
+		 */
+		if(hooknum == NF_IP_LOCAL_IN) {
+			iph->ttl = iph->ttl - 1;
+			iph->check = 0;
+			iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+		} else {
+			ip_decrease_ttl(iph);
+		}
+	}
+
+	if ((route_info->flags & IPT_ROUTE_TEE)) {
+		/*
+		 * Copy the *pskb, and route the copy. Will later return
+		 * IPT_CONTINUE for the original skb, which should continue
+		 * on its way as if nothing happened. The copy should be
+		 * independently delivered to the ROUTE --gw.
+		 */
+		skb = skb_copy(*pskb, GFP_ATOMIC);
+		if (!skb) {
+			if (net_ratelimit())
+				DEBUGP(KERN_DEBUG "ipt_ROUTE: copy failed!\n");
+			return IPT_CONTINUE;
+		}
+	}
+
+	/* Tell conntrack to forget this packet since it may get confused
+	 * when a packet is leaving with dst address == our address.
+	 * Good idea ? Dunno. Need advice.
+	 *
+	 * NEW: mark the skb with our &route_tee_track, so we avoid looping
+	 * on any already routed packet.
+	 */
+	if (!(route_info->flags & IPT_ROUTE_CONTINUE)) {
+		nf_conntrack_put(skb->nfct);
+		skb->nfct = &route_tee_track.infos[IP_CT_NEW];
+		nf_conntrack_get(skb->nfct);
+		skb->nfcache = 0;
+#ifdef CONFIG_NETFILTER_DEBUG
+		skb->nf_debug = 0;
+#endif
+	}
+
+	if (route_info->oif[0]) {
+		res = route_oif(route_info, skb);
+	} else if (route_info->iif[0]) {
+		res = route_iif(route_info, skb);
+	} else if (route_info->gw) {
+		res = route_gw(route_info, skb);
+	} else {
+		if (net_ratelimit())
+			DEBUGP(KERN_DEBUG "ipt_ROUTE: no parameter !\n");
+		res = IPT_CONTINUE;
+	}
+
+	if ((route_info->flags & IPT_ROUTE_TEE)) {
+		/* Here skb is our private copy.  The routing helpers consume
+		 * it only when they report NF_STOLEN; for every other result
+		 * nobody else knows about the copy, so free it here. */
+		if (res != NF_STOLEN)
+			kfree_skb(skb);
+		res = IPT_CONTINUE;
+	}
+
+	return res;
+}
+
+
+/* Rule validation: the rule must be in the "mangle" table, reachable only
+ * from the five standard hooks, and carry target data of the expected
+ * aligned size.  Returns 1 when the rule is acceptable, 0 otherwise.
+ */
+static int ipt_route_checkentry(const char *tablename,
+				const struct ipt_entry *e,
+				void *targinfo,
+				unsigned int targinfosize,
+				unsigned int hook_mask)
+{
+	if (strcmp(tablename, "mangle") != 0) {
+		printk("ipt_ROUTE: bad table `%s', use the `mangle' table.\n",
+		       tablename);
+		return 0;
+	}
+
+	if (hook_mask & ~(  (1 << NF_IP_PRE_ROUTING)
+			  | (1 << NF_IP_LOCAL_IN)
+			  | (1 << NF_IP_FORWARD)
+			  | (1 << NF_IP_LOCAL_OUT)
+			  | (1 << NF_IP_POST_ROUTING))) {
+		printk("ipt_ROUTE: bad hook\n");
+		return 0;
+	}
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_route_target_info))) {
+		printk(KERN_WARNING "ipt_ROUTE: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_route_target_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+
+static struct ipt_target ipt_route_reg
+= { { NULL, NULL }, "ROUTE", ipt_route_target, ipt_route_checkentry, NULL,
+    THIS_MODULE };
+
+
+static int __init init(void)
+{
+	/* Set up fake conntrack (stolen from raw.patch):
+	    - to never be deleted, not in any hashes */
+	atomic_set(&route_tee_track.ct_general.use, 1);
+	/*  - and look it like as a confirmed connection */
+	set_bit(IPS_CONFIRMED_BIT, &route_tee_track.status);
+	/*  - and prepare the ctinfo field for REJECT/NAT.
*/ + route_tee_track.infos[IP_CT_NEW].master = + route_tee_track.infos[IP_CT_RELATED].master = + route_tee_track.infos[IP_CT_RELATED + IP_CT_IS_REPLY].master = + &route_tee_track.ct_general; + /* Initialize fake conntrack so that NAT will skip it */ + route_tee_track.nat.info.initialized |= + (1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST); + + if (ipt_register_target(&ipt_route_reg)) + return -EINVAL; + + return 0; +} + + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_route_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_TRIGGER.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_TRIGGER.c index 99e7dfe7..07103fa5 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ipt_TRIGGER.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_TRIGGER.c @@ -62,7 +62,8 @@ LIST_HEAD(trigger_list); static void trigger_refresh(struct ipt_trigger *trig, unsigned long extra_jiffies) { - DEBUGP("%s: \n", __FUNCTION__); + DEBUGP("%s: mport=%u-%u\n", __FUNCTION__, trig->ports.mport[0], trig->ports.mport[1]); + IP_NF_ASSERT(trig); WRITE_LOCK(&ip_conntrack_lock); @@ -77,7 +78,8 @@ static void trigger_refresh(struct ipt_trigger *trig, unsigned long extra_jiffie static void __del_trigger(struct ipt_trigger *trig) { - DEBUGP("%s: \n", __FUNCTION__); + DEBUGP("%s: mport=%u-%u\n", __FUNCTION__, trig->ports.mport[0], trig->ports.mport[1]); + IP_NF_ASSERT(trig); MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); @@ -90,7 +92,9 @@ static void trigger_timeout(unsigned long ul_trig) { struct ipt_trigger *trig= (void *) ul_trig; - DEBUGP("trigger list %p timed out\n", trig); +// DEBUGP("trigger list %p timed out\n", trig); + DEBUGP("%s: mport=%u-%u\n", __FUNCTION__, trig->ports.mport[0], trig->ports.mport[1]); + WRITE_LOCK(&ip_conntrack_lock); __del_trigger(trig); WRITE_UNLOCK(&ip_conntrack_lock); @@ -250,7 +254,7 @@ trigger_dnat(struct sk_buff **pskb, IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW)); 
DEBUGP("%s: got ", __FUNCTION__); - DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + DUMP_TUPLE_RAW(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); /* Alter the destination of imcoming packet. */ newrange = ((struct ip_nat_multi_range) @@ -310,7 +314,7 @@ trigger_check(const char *tablename, DEBUGP("trigger_check: size %u.\n", targinfosize); return 0; } - if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_FORWARD))) { + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { DEBUGP("trigger_check: bad hooks %x.\n", hook_mask); return 0; } diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_TTL.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 00000000..2f0a4e7a --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_TTL.c @@ -0,0 +1,110 @@ +/* TTL modification target for IP tables + * (C) 2000 by Harald Welte + * + * Version: 1.8 + * + * This software is distributed under the terms of GNU GPL + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int ipt_ttl_target(struct sk_buff **pskb, unsigned int hooknum, + const struct net_device *in, const struct net_device *out, + const void *targinfo, void *userinfo) +{ + struct iphdr *iph = (*pskb)->nh.iph; + const struct ipt_TTL_info *info = targinfo; + u_int16_t diffs[2]; + int new_ttl; + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; + iph->ttl = new_ttl; + diffs[1] = htons(((unsigned)iph->ttl) 
<< 8); + iph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + iph->check^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + + return IPT_CONTINUE; +} + +static int ipt_ttl_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_TTL_info *info = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { + printk(KERN_WARNING "TTL: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_TTL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "TTL: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IPT_TTL_MAXMODE) { + printk(KERN_WARNING "TTL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) { + printk(KERN_WARNING "TTL: increment/decrement doesn't make sense with value 0\n"); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_TTL = { { NULL, NULL }, "TTL", + ipt_ttl_target, ipt_ttl_checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_target(&ipt_TTL); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_TTL); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_account.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_account.c new file mode 100644 index 00000000..7fd34562 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_account.c @@ -0,0 +1,942 @@ +/* + * accounting match (ipt_account.c) + * (C) 2003,2004 by Piotr Gasidlo (quaker@barbara.eu.org) + * + * Version: 0.1.7 + * + * This software is distributed under the terms of GNU GPL + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include + +#if defined(CONFIG_IP_NF_MATCH_ACCOUNT_DEBUG) 
+ #define dprintk(format,args...) printk(format,##args) +#else + #define dprintk(format,args...) +#endif + +static char version[] = +KERN_INFO IPT_ACCOUNT_NAME " " IPT_ACCOUNT_VERSION " : Piotr Gasid³o , http://www.barbara.eu.org/~quaker/ipt_account/\n"; + +/* rights for files created in /proc/net/ipt_account/ */ +static int permissions = 0644; +/* maximal netmask for single table */ +static int netmask = 16; + +/* module information */ +MODULE_AUTHOR("Piotr Gasidlo "); +MODULE_DESCRIPTION("Traffic accounting modules"); +MODULE_LICENSE("GPL"); +MODULE_PARM(permissions,"i"); +MODULE_PARM_DESC(permissions,"permissions on /proc/net/ipt_account/* files"); +MODULE_PARM(netmask, "i"); +MODULE_PARM_DESC(netmask, "maximum *save* size of one list (netmask)"); + +/* structure with statistics counters */ +struct t_ipt_account_stat { + u_int64_t b_all, b_tcp, b_udp, b_icmp, b_other; /* byte counters for all/tcp/udp/icmp/other traffic */ + u_int64_t p_all, p_tcp, p_udp, p_icmp, p_other; /* packet counters for all/tcp/udp/icmp/other traffic */ +}; + +/* stucture with statistics counters, used when table is created with --ashort switch */ +struct t_ipt_account_stat_short { + u_int64_t b_all; /* byte counters for all traffic */ + u_int64_t p_all; /* packet counters for all traffic */ +}; + +/* structure holding to/from statistics for single ip */ +struct t_ipt_account_ip_list { + struct t_ipt_account_stat src; + struct t_ipt_account_stat dest; + unsigned long time; /* time when this record was last updated */ + +}; + +/* same as above, for tables with --ashort switch */ +struct t_ipt_account_ip_list_short { + struct t_ipt_account_stat_short src; + struct t_ipt_account_stat_short dest; + unsigned long time; +}; + +/* structure describing single table */ +struct t_ipt_account_table { + char name[IPT_ACCOUNT_NAME_LEN]; /* table name ( = filename in /proc/net/ipt_account/) */ + union { /* table with statistics for each ip in network/netmask */ + struct t_ipt_account_ip_list *l; + 
/*
 * atoip - parse a dotted-quad IPv4 address from the start of a string
 * @buffer: NUL-terminated text; the address must begin at buffer[0]
 * @ip:     receives the address in host order (first octet in bits 31..24);
 *          set to 0 whenever parsing fails
 *
 * Each of the four octets needs at least one decimal digit and a value of
 * at most 255.  Parsing stops at the first character after the fourth
 * octet, whatever it is, including the terminating NUL.
 *
 * Returns the number of characters consumed, or 0 if the text is not a
 * valid address.
 *
 * Fixes over the previous version: the last octet is now stored even when
 * the address is the final token of the string (it used to be dropped and
 * left unchecked), an octet can no longer overflow while its digits are
 * accumulated, and the value is assembled in an unsigned type so no shift
 * reaches the sign bit.
 */
int atoip(char *buffer, u_int32_t *ip)
{
	const char *cursor = buffer;
	u_int32_t address = 0;
	int index;

	*ip = 0;

	for (index = 0; index < 4; index++) {
		unsigned int octet = 0;

		/* octets after the first are introduced by a single dot */
		if (index > 0) {
			if (*cursor != '.')
				return 0;
			cursor++;
		}

		if (!isdigit((unsigned char)*cursor))
			return 0;

		while (isdigit((unsigned char)*cursor)) {
			octet = octet * 10 + (unsigned int)(*cursor - '0');
			/* reject as soon as the range is exceeded so octet cannot wrap */
			if (octet > 255)
				return 0;
			cursor++;
		}

		address = (address << 8) | octet;
	}

	*ip = address;
	return (int)(cursor - buffer);
}
t_ipt_account_table *table = pde->data; + + unsigned int *bucket; + + spin_lock_bh(&table->ip_list_lock); + if (*pos >= table->count) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_KERNEL); + if (!bucket) + return ERR_PTR(-ENOMEM); + *bucket = *pos; + return bucket; +} + +static void *account_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct t_ipt_account_table *table = pde->data; + + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= table->count) { + kfree(v); + return NULL; + } + return bucket; +} + +static void account_seq_stop(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct t_ipt_account_table *table = pde->data; + unsigned int *bucket = (unsigned int *)v; + kfree(bucket); + spin_unlock_bh(&table->ip_list_lock); +} + +static int account_seq_write(struct file *file, const char *ubuffer, + size_t ulength, loff_t *pos) +{ + struct proc_dir_entry *pde = ((struct seq_file *)file->private_data)->private; + struct t_ipt_account_table *table = pde->data; + char buffer[1024], *bufferptr; + int length; + + u_int32_t ip; + int len, i; + struct t_ipt_account_ip_list l; + struct t_ipt_account_ip_list_short s; + u_int64_t *p, dummy; + + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() entered.\n"); + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() ulength = %zi.\n", ulength); + + length = ulength; + if (ulength > 1024) + length = 1024; + if (copy_from_user(buffer, ubuffer, length)) + return -EFAULT; + buffer[length - 1] = 0; + bufferptr = buffer; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() buffer = \'%s\' length = %i.\n", buffer, length); + + /* reset table counters */ + if (!memcmp(buffer, "reset", 5)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got \"reset\".\n"); + if (!table->shortlisting) { + spin_lock_bh(&table->ip_list_lock); + memset(table->ip_list.l, 0, sizeof(struct 
t_ipt_account_ip_list) * table->count); + spin_unlock_bh(&table->ip_list_lock); + } else { + spin_lock_bh(&table->ip_list_lock); + memset(table->ip_list.s, 0, sizeof(struct t_ipt_account_ip_list_short) * table->count); + spin_unlock_bh(&table->ip_list_lock); + } + return length; + } + + if (!memcmp(buffer, "ip", 2)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got \"ip\".\n"); + bufferptr += 2; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (*bufferptr != '=') { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected equal (%ti).\n", bufferptr - buffer); + return length; /* expected equal */ + } + bufferptr += 1; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!(len = atoip(bufferptr, &ip))) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected ip (%ti).\n", bufferptr - buffer); + return length; /* expected ip */ + } + bufferptr += len; + if ((ip & table->netmask) != table->network) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected ip [%u.%u.%u.%u] from table's network/netmask [%u.%u.%u.%u/%u.%u.%u.%u].\n", HIPQUAD(ip), HIPQUAD(table->network), HIPQUAD(table->netmask)); + return length; /* expected ip from table's network/netmask */ + } + if (!table->shortlisting) { + memset(&l, 0, sizeof(struct t_ipt_account_ip_list)); + while(*bufferptr) { + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!memcmp(bufferptr, "bytes_src", 9)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got bytes_src (%ti).\n", bufferptr - buffer); + p = 
&l.src.b_all; + bufferptr += 9; + } else if (!memcmp(bufferptr, "bytes_dest", 10)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got bytes_dest (%ti).\n", bufferptr - buffer); + p = &l.dest.b_all; + bufferptr += 10; + } else if (!memcmp(bufferptr, "packets_src", 11)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got packets_src (%ti).\n", bufferptr - buffer); + p = &l.src.p_all; + bufferptr += 11; + } else if (!memcmp(bufferptr, "packets_dest", 12)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got packets_dest (%ti).\n", bufferptr - buffer); + p = &l.dest.p_all; + bufferptr += 12; + } else if (!memcmp(bufferptr, "time", 4)) { + /* time hack, ignore time tokens */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got time (%ti).\n", bufferptr - buffer); + bufferptr += 4; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (*bufferptr != '=') { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected equal (%ti).\n", bufferptr - buffer); + return length; /* expected equal */ + } + bufferptr += 1; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!(len = atoi64(bufferptr, &dummy))) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected int64 (%ti).\n", bufferptr - buffer); + return length; /* expected int64 */ + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got %llu (%ti).\n", dummy, bufferptr - buffer); + bufferptr += len; + continue; /* skip time token */ + } else + return length; /* expected token */ + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected 
space */ + } + bufferptr += 1; + if (*bufferptr != '=') { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected equal (%ti).\n", bufferptr - buffer); + return length; /* expected equal */ + } + bufferptr += 1; + for (i = 0; i < 5; i++) { + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!(len = atoi64(bufferptr, p))) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected int64 (%ti).\n", bufferptr - buffer); + return length; /* expected int64 */ + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got %llu (%ti).\n", *p, bufferptr - buffer); + bufferptr += len; + p++; + } + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() updating row.\n"); + spin_lock_bh(&table->ip_list_lock); + /* update counters, do not overwrite time field */ + memcpy(&table->ip_list.l[ip - table->network], &l, sizeof(struct t_ipt_account_ip_list) - sizeof(unsigned long)); + spin_unlock_bh(&table->ip_list_lock); + } else { + memset(&s, 0, sizeof(struct t_ipt_account_ip_list_short)); + while(*bufferptr) { + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!memcmp(bufferptr, "bytes_src", 9)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got bytes_src (%ti).\n", bufferptr - buffer); + p = &s.src.b_all; + bufferptr += 9; + } else if (!memcmp(bufferptr, "bytes_dest", 10)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got bytes_dest (%ti).\n", bufferptr - buffer); + p = &s.dest.b_all; + bufferptr += 10; + } else if (!memcmp(bufferptr, "packets_src", 11)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got packets_src (%ti).\n", bufferptr - buffer); + p = &s.src.p_all; + bufferptr += 11; + } else if 
(!memcmp(bufferptr, "packets_dest", 12)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got packets_dest (%ti).\n", bufferptr - buffer); + p = &s.dest.p_all; + bufferptr += 12; + } else if (!memcmp(bufferptr, "time", 4)) { + /* time hack, ignore time tokens */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got time (%ti).\n", bufferptr - buffer); + bufferptr += 4; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (*bufferptr != '=') { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected equal (%ti).\n", bufferptr - buffer); + return length; /* expected equal */ + } + bufferptr += 1; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!(len = atoi64(bufferptr, &dummy))) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected int64 (%ti).\n", bufferptr - buffer); + return length; /* expected int64 */ + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got %llu (%ti).\n", dummy, bufferptr - buffer); + bufferptr += len; + continue; /* skip time token */ + } else { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected token (%ti).\n", bufferptr - buffer); + return length; /* expected token */ + } + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space (%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (*bufferptr != '=') { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected equal (%ti).\n", bufferptr - buffer); + return length; /* expected equal */ + } + bufferptr += 1; + if (!isspace(*bufferptr)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected space 
(%ti).\n", bufferptr - buffer); + return length; /* expected space */ + } + bufferptr += 1; + if (!(len = atoi64(bufferptr, p))) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() expected int64 (%ti).\n", bufferptr - buffer); + return length; /* expected int64 */ + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() got %llu (%ti).\n", *p, bufferptr - buffer); + bufferptr += len; + } + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() updating row.\n"); + spin_lock_bh(&table->ip_list_lock); + /* update counters, do not overwrite time field */ + memcpy(&table->ip_list.s[ip - table->network], &s, sizeof(struct t_ipt_account_ip_list_short) - sizeof(unsigned long)); + spin_unlock_bh(&table->ip_list_lock); + } + } + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": account_seq_write() left.\n"); + return length; +} + + +static int account_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct t_ipt_account_table *table = pde->data; + unsigned int *bucket = (unsigned int *)v; + + u_int32_t address = table->network + *bucket; + struct timespec last; + + if (!table->shortlisting) { + jiffies_to_timespec(jiffies - table->ip_list.l[*bucket].time, &last); + seq_printf(s, + "ip = %u.%u.%u.%u bytes_src = %llu %llu %llu %llu %llu packets_src = %llu %llu %llu %llu %llu bytes_dest = %llu %llu %llu %llu %llu packets_dest = %llu %llu %llu %llu %llu time = %lu\n", + HIPQUAD(address), + table->ip_list.l[*bucket].src.b_all, + table->ip_list.l[*bucket].src.b_tcp, + table->ip_list.l[*bucket].src.b_udp, + table->ip_list.l[*bucket].src.b_icmp, + table->ip_list.l[*bucket].src.b_other, + table->ip_list.l[*bucket].src.p_all, + table->ip_list.l[*bucket].src.p_tcp, + table->ip_list.l[*bucket].src.p_udp, + table->ip_list.l[*bucket].src.p_icmp, + table->ip_list.l[*bucket].src.p_other, + table->ip_list.l[*bucket].dest.b_all, + table->ip_list.l[*bucket].dest.b_tcp, + table->ip_list.l[*bucket].dest.b_udp, + 
table->ip_list.l[*bucket].dest.b_icmp, + table->ip_list.l[*bucket].dest.b_other, + table->ip_list.l[*bucket].dest.p_all, + table->ip_list.l[*bucket].dest.p_tcp, + table->ip_list.l[*bucket].dest.p_udp, + table->ip_list.l[*bucket].dest.p_icmp, + table->ip_list.l[*bucket].dest.p_other, + last.tv_sec + ); + } else { + jiffies_to_timespec(jiffies - table->ip_list.s[*bucket].time, &last); + seq_printf(s, + "ip = %u.%u.%u.%u bytes_src = %llu packets_src = %llu bytes_dest = %llu packets_dest = %llu time = %lu\n", + HIPQUAD(address), + table->ip_list.s[*bucket].src.b_all, + table->ip_list.s[*bucket].src.p_all, + table->ip_list.s[*bucket].dest.b_all, + table->ip_list.s[*bucket].dest.p_all, + last.tv_sec + ); + } + return 0; +} + +static struct seq_operations account_seq_ops = { + .start = account_seq_start, + .next = account_seq_next, + .stop = account_seq_stop, + .show = account_seq_show +}; + +static int account_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &account_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +} + +static struct file_operations account_file_ops = { + .owner = THIS_MODULE, + .open = account_seq_open, + .read = seq_read, + .write = account_seq_write, + .llseek = seq_lseek, + .release = seq_release +}; + +/* do raw accounting */ +static inline void do_account(struct t_ipt_account_stat *stat, const struct sk_buff *skb) { + + /* update packet & bytes counters in *stat structure */ + stat->b_all += skb->len; + stat->p_all++; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + stat->b_tcp += skb->len; + stat->p_tcp++; + break; + case IPPROTO_UDP: + stat->b_udp += skb->len; + stat->p_udp++; + break; + case IPPROTO_ICMP: + stat->b_icmp += skb->len; + stat->p_icmp++; + break; + default: + stat->b_other += skb->len; + stat->p_other++; + } +} + +static inline void do_account_short(struct t_ipt_account_stat_short *stat, const struct sk_buff *skb) { + + /* 
update packet & bytes counters in *stat structure */ + stat->b_all += skb->len; + stat->p_all++; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + + const struct t_ipt_account_info *info = (struct t_ipt_account_info*)matchinfo; + struct t_ipt_account_table *table; + int ret; + unsigned long now; + + u_int32_t address; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() entered.\n"); + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() match name = %s.\n", info->name); + + spin_lock_bh(&account_lock); + /* find the right table */ + table = account_tables; + while (table && strncmp(table->name, info->name, IPT_ACCOUNT_NAME_LEN) && (table = table->next)); + spin_unlock_bh(&account_lock); + + if (table == NULL) { + /* ups, no table with that name */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() table %s not found. Leaving.\n", info->name); + return 0; + } + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() table found %s\n", table->name); + + /* lock table while updating statistics */ + spin_lock_bh(&table->ip_list_lock); + + /* default: no match */ + ret = 0; + + /* get current time */ + now = jiffies; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() got packet src = %u.%u.%u.%u, dst = %u.%u.%u.%u, proto = %u.\n", NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), skb->nh.iph->protocol); + + /* check whether traffic from source ip address ... */ + address = ntohl(skb->nh.iph->saddr); + /* ... 
is being accounted by this table */ + if (address && ((u_int32_t)(address & table->netmask) == (u_int32_t)table->network)) { + /* yes, account this packet */ + dprintk(KERN_INFO "ipt_account: match() accounting packet src = %u.%u.%u.%u, proto = %u.\n", HIPQUAD(address), skb->nh.iph->protocol); + /* update counters this host */ + if (!table->shortlisting) { + do_account(&table->ip_list.l[address - table->network].src, skb); + table->ip_list.l[address - table->network].time = now; + /* update also counters for all hosts in this table (network address) */ + if (table->netmask != INADDR_BROADCAST) { + do_account(&table->ip_list.l[0].src, skb); + table->ip_list.l[0].time = now; + } + } else { + do_account_short(&table->ip_list.s[address - table->network].src, skb); + table->ip_list.s[address - table->network].time = now; + /* update also counters for all hosts in this table (network address) */ + if (table->netmask != INADDR_BROADCAST) { + do_account_short(&table->ip_list.s[0].src, skb); + table->ip_list.s[0].time = now; + } + } + /* yes, it's a match */ + ret = 1; + } + + /* do the same thing with destination ip address */ + address = ntohl(skb->nh.iph->daddr); + if (address && ((u_int32_t)(address & table->netmask) == (u_int32_t)table->network)) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() accounting packet dst = %u.%u.%u.%u, proto = %u.\n", HIPQUAD(address), skb->nh.iph->protocol); + if (!table->shortlisting) { + do_account(&table->ip_list.l[address - table->network].dest, skb); + table->ip_list.l[address - table->network].time = now; + if (table->netmask != INADDR_BROADCAST) { + do_account(&table->ip_list.l[0].dest, skb); + table->ip_list.s[0].time = now; + } + } else { + do_account_short(&table->ip_list.s[address - table->network].dest, skb); + table->ip_list.s[address - table->network].time = now; + if (table->netmask != INADDR_BROADCAST) { + do_account_short(&table->ip_list.s[0].dest, skb); + table->ip_list.s[0].time = now; + } + } + ret = 1; + } + 
spin_unlock_bh(&table->ip_list_lock); + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": match() left.\n"); + + return ret; +} + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct t_ipt_account_info *info = matchinfo; + struct t_ipt_account_table *table, *find_table, *last_table; + int ret = 0; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() entered.\n"); + + if (matchinfosize != IPT_ALIGN(sizeof(struct t_ipt_account_info))) return 0; + if (!info->name || !info->name[0]) return 0; + + /* find whether table with this name already exists */ + spin_lock_bh(&account_lock); + find_table = account_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_ACCOUNT_NAME_LEN) && (find_table = find_table->next) ); + if (find_table != NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() table %s found.\n", info->name); + /* if table exists, check whether table network/netmask equals rule network/netmask */ + if (find_table->network != info->network || find_table->netmask != info->netmask || find_table->shortlisting != info->shortlisting) { + spin_unlock_bh(&account_lock); + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() wrong parameters (not equals existing table parameters).\n"); + ret = 0; + goto failure; + } + /* increment table use count */ + find_table->use_count++; + spin_unlock_bh(&account_lock); + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() incrementing use count.\n"); + ret = 1; + goto failure; + } + spin_unlock_bh(&account_lock); + + /* check netmask first, before allocating memory */ + if (info->netmask < ((1 << netmask) - 1)) { + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() too big netmask.\n"); + ret = 0; + goto failure; + } + + /* table doesn't exist - create new */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() allocating %zu for new table %s.\n", sizeof(struct t_ipt_account_table), info->name); 
+ table = vmalloc(sizeof(struct t_ipt_account_table)); + if (table == NULL) { + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() failed to allocate %zu for new table %s.\n", sizeof(struct t_ipt_account_table), info->name); + ret = 0; /* was -ENOMEM */ + goto failure; + } + + /* setup table parameters */ + table->ip_list_lock = SPIN_LOCK_UNLOCKED; + table->next = NULL; + table->use_count = 1; + table->network = info->network; + table->netmask = info->netmask; + table->shortlisting = info->shortlisting; + table->count = (~table->netmask) + 1; + strncpy(table->name,info->name,IPT_ACCOUNT_NAME_LEN); + table->name[IPT_ACCOUNT_NAME_LEN - 1] = '\0'; + + /* allocate memory for table->ip_list */ + if (!table->shortlisting) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() allocating %zu for ip_list.\n", sizeof(struct t_ipt_account_ip_list) * table->count); + table->ip_list.l = vmalloc(sizeof(struct t_ipt_account_ip_list) * table->count); + if (table->ip_list.l == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() failed to allocate %zu for ip_list.\n", sizeof(struct t_ipt_account_ip_list) * table->count); + ret = 0; /* was -ENOMEM */ + goto failure_table; + } + memset(table->ip_list.l, 0, sizeof(struct t_ipt_account_ip_list) * table->count); + } else { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() allocating %zu for ip_list.\n", sizeof(struct t_ipt_account_ip_list_short) * table->count); + table->ip_list.s = vmalloc(sizeof(struct t_ipt_account_ip_list_short) * table->count); + if (table->ip_list.s == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() failed to allocate %zu for ip_list.\n", sizeof(struct t_ipt_account_ip_list_short) * table->count); + ret = 0; /* was -ENOMEM */ + goto failure_table; + } + memset(table->ip_list.s, 0, sizeof(struct t_ipt_account_ip_list_short) * table->count); + } + + /* put table into chain */ + spin_lock_bh(&account_lock); + find_table = account_tables; + while( (last_table = find_table) && 
strncmp(info->name, find_table->name, IPT_ACCOUNT_NAME_LEN) && (find_table = find_table->next) ); + if (find_table != NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() table %s found.\n", info->name); + if (find_table->network != info->network || find_table->netmask != info->netmask) { + spin_unlock_bh(&account_lock); + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() wrong network/netmask.\n"); + ret = 0; + goto failure_ip_list; + } + find_table->use_count++; + spin_unlock_bh(&account_lock); + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() incrementing use count.\n"); + ret = 1; + goto failure_ip_list; + } + if (!last_table) + account_tables = table; + else + last_table->next = table; + spin_unlock_bh(&account_lock); + + /* create procfs status file */ + table->status_file = create_proc_entry(table->name, permissions, proc_net_ipt_account); + if (table->status_file == NULL) { + ret = 0; /* was -ENOMEM */ + goto failure_unlink; + } + table->status_file->owner = THIS_MODULE; + table->status_file->data = table; + wmb(); +// if (!table->shortlisting) + table->status_file->proc_fops = &account_file_ops; +// else +// table->status_file->proc_fops = &account_file_ops_short; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() left.\n"); + /* everything went just okey */ + return 1; + + /* do cleanup in case of failure */ +failure_unlink: + /* remove table from list */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() removing table.\n"); + spin_lock_bh(&account_lock); + last_table = NULL; + table = account_tables; + if (table == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() no table found. Leaving.\n"); + spin_unlock_bh(&account_lock); + return 0; /* was -ENOMEM */ + } + while (strncmp(info->name, table->name, IPT_ACCOUNT_NAME_LEN) && (last_table = table) && (table = table->next)); + if (table == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() table already destroyed. 
Leaving.\n"); + spin_unlock_bh(&account_lock); + return 0; /* was -ENOMEM */ + } + if (last_table) + last_table->next = table->next; + else + account_tables = table->next; + spin_unlock_bh(&account_lock); +failure_ip_list: + /* free memory allocated for statistics table */ + if (!table->shortlisting) + vfree(table->ip_list.l); + else + vfree(table->ip_list.s); +failure_table: + /* free table */ + vfree(table); +failure: + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() left. Table not created.\n"); + /* failure return */ + return ret; +} + +static void destroy(void *matchinfo, + unsigned int matchinfosize) +{ + const struct t_ipt_account_info *info = matchinfo; + struct t_ipt_account_table *table, *last_table; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() entered.\n"); + + if (matchinfosize != IPT_ALIGN(sizeof(struct t_ipt_account_info))) return; + + /* search for table */ + spin_lock_bh(&account_lock); + last_table = NULL; + table = account_tables; + if(table == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() no tables found. Leaving.\n"); + spin_unlock_bh(&account_lock); + return; + } + while( strncmp(info->name,table->name,IPT_ACCOUNT_NAME_LEN) && (last_table = table) && (table = table->next) ); + if (table == NULL) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() no table %s not found. Leaving.\n", info->name); + spin_unlock_bh(&account_lock); + return; + } + + /* decrement table use-count */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() decrementing use count.\n"); + table->use_count--; + if (table->use_count) { + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() table still in use. Leaving.\n"); + spin_unlock_bh(&account_lock); + return; + } + + /* remove table if use-count is zero */ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() table %s not used. 
Removing.\n", table->name); + + /* unlink table */ + if(last_table) + last_table->next = table->next; + else + account_tables = table->next; + spin_unlock_bh(&account_lock); + + /* wait while table is still in use */ + spin_lock_bh(&table->ip_list_lock); + spin_unlock_bh(&table->ip_list_lock); + + /* remove proc entries */ + remove_proc_entry(table->name, proc_net_ipt_account); + + /* remove table */ + if (!table->shortlisting) + vfree(table->ip_list.l); + else + vfree(table->ip_list.s); + vfree(table); + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": destory() left.\n"); + return; +} + +static struct ipt_match account_match = { + .name = "account", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + int err; + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": __init() entered.\n"); + printk(version); + /* check params */ + if (netmask > 32 || netmask < 0) { + printk(KERN_INFO "account: Wrong netmask given by netmask parameter (%i). 
Valid is 32 to 0.\n", netmask); + err = -EINVAL; + goto doexit; + } + + /* create /proc/net/ipt_account directory */ + proc_net_ipt_account = proc_mkdir("ipt_account", proc_net); + if (!proc_net_ipt_account) { + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() failed to create procfs entry.\n"); + err = -ENOMEM; + goto doexit; + } + proc_net_ipt_account->owner = THIS_MODULE; + + err = ipt_register_match(&account_match); + if (err) { + printk(KERN_INFO IPT_ACCOUNT_NAME ": checkentry() failed to register match.\n"); + remove_proc_entry("ipt_account", proc_net); + } +doexit: + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": __init() left.\n"); + return err; +} + +static void __exit fini(void) +{ + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": __exit() entered.\n"); + + ipt_unregister_match(&account_match); + /* remove /proc/net/ipt_account/ directory */ + remove_proc_entry("ipt_account", proc_net); + + dprintk(KERN_INFO IPT_ACCOUNT_NAME ": __exit() left.\n"); +} + +module_init(init); +module_exit(fini); + diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_bcount.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_bcount.c new file mode 100644 index 00000000..63f93a14 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_bcount.c @@ -0,0 +1,59 @@ +/* + + bcount match (experimental) + Copyright (C) 2006 Jonathan Zarate + + Licensed under GNU GPL v2 or later. + +*/ +#include +#include +#include +#include +#include +#include + +// #define LOG printk +#define LOG(...) 
do { } while (0); + + +static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const void *matchinfo, int offset, const void *hdr, u_int16_t datalen, int *hotdrop) +{ + const struct ipt_bcount_match *info = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) return !info->invert; + return ((ct->bcount >= info->min) && (ct->bcount <= info->max)) ^ info->invert; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_bcount_match))); +} + + +static struct ipt_match bcount_match += { { NULL, NULL }, "bcount", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + LOG(KERN_INFO "ipt_bcount <" __DATE__ " " __TIME__ "> loaded\n"); + return ipt_register_match(&bcount_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&bcount_match); +} + +module_init(init); +module_exit(fini); + + +MODULE_AUTHOR("Jonathan Zarate"); +MODULE_DESCRIPTION("bcount match"); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_condition.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_condition.c new file mode 100644 index 00000000..c8ee72d5 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_condition.c @@ -0,0 +1,256 @@ +/*-------------------------------------------*\ +| Netfilter Condition Module | +| | +| Description: This module allows firewall | +| rules to match using condition variables | +| stored in /proc files. | +| | +| Author: Stephane Ouellette 2002-10-22 | +| | +| | +| History: | +| 2003-02-10 Second version with improved | +| locking and simplified code. | +| | +| This software is distributed under the | +| terms of the GNU GPL. 
| +\*-------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include + + +#ifndef CONFIG_PROC_FS +#error "Proc file system support is required for this module" +#endif + + +MODULE_AUTHOR("Stephane Ouellette "); +MODULE_DESCRIPTION("Allows rules to match against condition variables"); +MODULE_LICENSE("GPL"); + + +struct condition_variable { + struct condition_variable *next; + struct proc_dir_entry *status_proc; + atomic_t refcount; + int enabled; /* TRUE == 1, FALSE == 0 */ +}; + + +static rwlock_t list_lock; +static struct condition_variable *head = NULL; +static struct proc_dir_entry *proc_net_condition = NULL; + + +static int +ipt_condition_read_info(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + struct condition_variable *var = + (struct condition_variable *) data; + + if (offset == 0) { + *start = buffer; + buffer[0] = (var->enabled) ? '1' : '0'; + buffer[1] = '\n'; + return 2; + } + + *eof = 1; + return 0; +} + + +static int +ipt_condition_write_info(struct file *file, const char *buffer, + unsigned long length, void *data) +{ + struct condition_variable *var = + (struct condition_variable *) data; + + if (length) { + /* Match only on the first character */ + switch (buffer[0]) { + case '0': + var->enabled = 0; + break; + case '1': + var->enabled = 1; + } + } + + return (int) length; +} + + +static int +match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, int offset, + const void *hdr, u_int16_t datalen, int *hotdrop) +{ + const struct condition_info *info = + (const struct condition_info *) matchinfo; + struct condition_variable *var; + int condition_status = 0; + + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + condition_status = var->enabled; + break; + } + } + + read_unlock(&list_lock); + + return condition_status ^ 
info->invert; +} + + + +static int +checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, unsigned int hook_mask) +{ + struct condition_info *info = (struct condition_info *) matchinfo; + struct condition_variable *var, *newvar; + + if (matchsize != IPT_ALIGN(sizeof(struct condition_info))) + return 0; + + /* The first step is to check if the condition variable already exists. */ + /* Here, a read lock is sufficient because we won't change the list */ + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + atomic_inc(&var->refcount); + read_unlock(&list_lock); + return 1; + } + } + + read_unlock(&list_lock); + + /* At this point, we need to allocate a new condition variable */ + newvar = kmalloc(sizeof(struct condition_variable), GFP_KERNEL); + + if (!newvar) + return -ENOMEM; + + /* Create the condition variable's proc file entry */ + newvar->status_proc = create_proc_entry(info->name, 0644, proc_net_condition); + + if (!newvar->status_proc) { + /* + * There are two possibilities: + * 1- Another condition variable with the same name has been created, which is valid. + * 2- There was a memory allocation error. 
+ */ + kfree(newvar); + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + atomic_inc(&var->refcount); + read_unlock(&list_lock); + return 1; + } + } + + read_unlock(&list_lock); + return -ENOMEM; + } + + atomic_set(&newvar->refcount, 1); + newvar->enabled = 0; + newvar->status_proc->owner = THIS_MODULE; + newvar->status_proc->data = newvar; + wmb(); + newvar->status_proc->read_proc = ipt_condition_read_info; + newvar->status_proc->write_proc = ipt_condition_write_info; + + write_lock(&list_lock); + + newvar->next = head; + head = newvar; + + write_unlock(&list_lock); + + return 1; +} + + +static void +destroy(void *matchinfo, unsigned int matchsize) +{ + struct condition_info *info = (struct condition_info *) matchinfo; + struct condition_variable *var, *prev = NULL; + + if (matchsize != IPT_ALIGN(sizeof(struct condition_info))) + return; + + write_lock(&list_lock); + + for (var = head; var && strcmp(info->name, var->status_proc->name); + prev = var, var = var->next); + + if (var && atomic_dec_and_test(&var->refcount)) { + if (prev) + prev->next = var->next; + else + head = var->next; + + write_unlock(&list_lock); + remove_proc_entry(var->status_proc->name, proc_net_condition); + kfree(var); + } else + write_unlock(&list_lock); +} + + +static struct ipt_match condition_match = { + .name = "condition", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + + +static int __init +init(void) +{ + int errorcode; + + rwlock_init(&list_lock); + proc_net_condition = proc_mkdir("ipt_condition", proc_net); + + if (proc_net_condition) { + errorcode = ipt_register_match(&condition_match); + + if (errorcode) + remove_proc_entry("ipt_condition", proc_net); + } else + errorcode = -EACCES; + + return errorcode; +} + + +static void __exit +fini(void) +{ + ipt_unregister_match(&condition_match); + remove_proc_entry("ipt_condition", proc_net); +} + 
+module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_connlimit.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_connlimit.c new file mode 100644 index 00000000..abf8efff --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_connlimit.c @@ -0,0 +1,222 @@ +/* + * netfilter module to limit the number of parallel tcp + * connections per IP address. + * (c) 2000 Gerd Knorr + * Nov 2002: Martin Bene : + * only ignore TIME_WAIT or gone connections + * + * based on ... + * + * Kernel module to match connection tracking information. + * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG 0 + +MODULE_LICENSE("GPL"); + +/* we'll save the tuples of all connections we care about */ +struct ipt_connlimit_conn +{ + struct list_head list; + struct ip_conntrack_tuple tuple; +}; + +struct ipt_connlimit_data { + spinlock_t lock; + struct list_head iphash[256]; +}; + +static inline unsigned ipt_iphash(const unsigned addr) +{ + return ((addr ^ (addr >> 8) ^ (addr >> 16) ^ (addr >> 24)) & 0xff); +} + +static int count_them(struct ipt_connlimit_data *data, + u_int32_t addr, u_int32_t mask, + struct ip_conntrack *ct) +{ +#if DEBUG + const static char *tcp[] = { "none", "established", "syn_sent", "syn_recv", + "fin_wait", "time_wait", "close", "close_wait", + "last_ack", "listen" }; +#endif + int addit = 1, matches = 0; + struct ip_conntrack_tuple tuple; + struct ip_conntrack_tuple_hash *found; + struct ipt_connlimit_conn *conn; + struct list_head *hash,*lh; + + spin_lock_bh(&data->lock); + tuple = ct->tuplehash[0].tuple; + hash = &data->iphash[ipt_iphash(addr & mask)]; + + /* check the saved connections */ + for (lh = hash->next; lh != hash; lh = lh->next) { + conn = list_entry(lh,struct ipt_connlimit_conn,list); + found = ip_conntrack_find_get(&conn->tuple,ct); + if (found != NULL && + 0 == 
memcmp(&conn->tuple,&tuple,sizeof(tuple)) && + found->ctrack->proto.tcp.state != TCP_CONNTRACK_TIME_WAIT) { + /* Just to be sure we have it only once in the list. + We should'nt see tuples twice unless someone hooks this + into a table without "-p tcp --syn" */ + addit = 0; + } +#if DEBUG + printk("ipt_connlimit [%d]: src=%u.%u.%u.%u:%d dst=%u.%u.%u.%u:%d %s\n", + ipt_iphash(addr & mask), + NIPQUAD(conn->tuple.src.ip), ntohs(conn->tuple.src.u.tcp.port), + NIPQUAD(conn->tuple.dst.ip), ntohs(conn->tuple.dst.u.tcp.port), + (NULL != found) ? tcp[found->ctrack->proto.tcp.state] : "gone"); +#endif + if (NULL == found) { + /* this one is gone */ + lh = lh->prev; + list_del(lh->next); + kfree(conn); + continue; + } + if (found->ctrack->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT) { + /* we don't care about connections which are + closed already -> ditch it */ + lh = lh->prev; + list_del(lh->next); + kfree(conn); + nf_conntrack_put(&found->ctrack->infos[0]); + continue; + } + if ((addr & mask) == (conn->tuple.src.ip & mask)) { + /* same source IP address -> be counted! 
*/ + matches++; + } + nf_conntrack_put(&found->ctrack->infos[0]); + } + if (addit) { + /* save the new connection in our list */ +#if DEBUG + printk("ipt_connlimit [%d]: src=%u.%u.%u.%u:%d dst=%u.%u.%u.%u:%d new\n", + ipt_iphash(addr & mask), + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); +#endif + conn = kmalloc(sizeof(*conn),GFP_ATOMIC); + if (NULL == conn) + return -1; + memset(conn,0,sizeof(*conn)); + INIT_LIST_HEAD(&conn->list); + conn->tuple = tuple; + list_add(&conn->list,hash); + matches++; + } + spin_unlock_bh(&data->lock); + return matches; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_connlimit_info *info = matchinfo; + int connections, match; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (NULL == ct) { + printk("ipt_connlimit: Oops: invalid ct state ?\n"); + *hotdrop = 1; + return 0; + } + connections = count_them(info->data,skb->nh.iph->saddr,info->mask,ct); + if (-1 == connections) { + printk("ipt_connlimit: Hmm, kmalloc failed :-(\n"); + *hotdrop = 1; /* let's free some memory :-) */ + return 0; + } + match = (info->inverse) ? (connections <= info->limit) : (connections > info->limit); +#if DEBUG + printk("ipt_connlimit: src=%u.%u.%u.%u mask=%u.%u.%u.%u " + "connections=%d limit=%d match=%s\n", + NIPQUAD(skb->nh.iph->saddr), NIPQUAD(info->mask), + connections, info->limit, match ? 
"yes" : "no"); +#endif + + return match; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_connlimit_info *info = matchinfo; + int i; + + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connlimit_info))) + return 0; + + /* refuse anything but tcp */ + if (ip->proto != IPPROTO_TCP) + return 0; + + /* init private data */ + info->data = kmalloc(sizeof(struct ipt_connlimit_data),GFP_KERNEL); + spin_lock_init(&(info->data->lock)); + for (i = 0; i < 256; i++) + INIT_LIST_HEAD(&(info->data->iphash[i])); + + return 1; +} + +static void destroy(void *matchinfo, unsigned int matchinfosize) +{ + struct ipt_connlimit_info *info = matchinfo; + struct ipt_connlimit_conn *conn; + struct list_head *hash; + int i; + + /* cleanup */ + for (i = 0; i < 256; i++) { + hash = &(info->data->iphash[i]); + while (hash != hash->next) { + conn = list_entry(hash->next,struct ipt_connlimit_conn,list); + list_del(hash->next); + kfree(conn); + } + } + kfree(info->data); +} + +static struct ipt_match connlimit_match += { { NULL, NULL }, "connlimit", &match, &check, &destroy, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&connlimit_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&connlimit_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_connmark.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_connmark.c new file mode 100644 index 00000000..d795a339 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_connmark.c @@ -0,0 +1,83 @@ +/* This kernel module matches connection mark values set by the + * CONNMARK target + * + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +MODULE_AUTHOR("Henrik Nordstrom "); +MODULE_DESCRIPTION("IP tables connmark match module"); +MODULE_LICENSE("GPL"); + +#include +#include +#include + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_connmark_info *info = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) + return 0; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) + return 0; + + return 1; +} + +static struct ipt_match connmark_match = { + .name = "connmark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&connmark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&connmark_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_exp.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_exp.c new file mode 100644 index 00000000..1b682b9c --- /dev/null +++ 
b/release/src/linux/linux/net/ipv4/netfilter/ipt_exp.c @@ -0,0 +1,57 @@ +/* + + Experimental Netfilter Crap + Copyright (C) 2006 Jonathan Zarate + +*/ +#include +#include +#include +#include + +#include +#include +#include "../../bridge/br_private.h" + + +static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const void *matchinfo, int offset, const void *hdr, u_int16_t datalen, int *hotdrop) +{ +// const struct ipt_exp_info *info = matchinfo; + + if ((skb->mac.raw >= skb->head) && ((skb->mac.raw + ETH_HLEN) <= skb->data)) { + printk(KERN_INFO "exp src=%02X:%02X:%02X:%02X:%02X:%02X dst=%02X:%02X:%02X:%02X:%02X:%02X\n", + skb->mac.ethernet->h_source[0], skb->mac.ethernet->h_source[1], skb->mac.ethernet->h_source[2], + skb->mac.ethernet->h_source[3], skb->mac.ethernet->h_source[4], skb->mac.ethernet->h_source[5], + skb->mac.ethernet->h_dest[0], skb->mac.ethernet->h_dest[1], skb->mac.ethernet->h_dest[2], + skb->mac.ethernet->h_dest[3], skb->mac.ethernet->h_dest[4], skb->mac.ethernet->h_dest[5]); + return 1; + } + printk(KERN_INFO "exp mac=%p head=%p in=%p\n", skb->mac.raw, skb->head, in); + return 0; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_exp_info))); +} + +static struct ipt_match exp_match + = { { NULL, NULL }, "exp", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + printk(KERN_INFO "exp init " __DATE__ " " __TIME__ "\n"); + return ipt_register_match(&exp_match); +} + +static void __exit fini(void) +{ + printk(KERN_INFO "exp fini\n"); + ipt_unregister_match(&exp_match); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_geoip.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_geoip.c new file mode 100644 index 00000000..fbd1a95c --- /dev/null +++ 
b/release/src/linux/linux/net/ipv4/netfilter/ipt_geoip.c @@ -0,0 +1,272 @@ +/* netfilter's kernel module for the geoip match + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (c) 2004 Cookinglinux + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Samuel Jean, Nicolas Bouliane"); +MODULE_DESCRIPTION("iptables/netfilter's geoip match"); + +struct geoip_info *head = NULL; +static spinlock_t geoip_lock = SPIN_LOCK_UNLOCKED; + +static struct geoip_info *add_node(struct geoip_info *memcpy) +{ + struct geoip_info *p = + (struct geoip_info *)kmalloc(sizeof(struct geoip_info), GFP_KERNEL); + + struct geoip_subnet *s; + + if ((p == NULL) || (copy_from_user(p, memcpy, sizeof(struct geoip_info)) != 0)) + return NULL; + + s = (struct geoip_subnet *)kmalloc(p->count * sizeof(struct geoip_subnet), GFP_KERNEL); + if ((s == NULL) || (copy_from_user(s, p->subnets, p->count * sizeof(struct geoip_subnet)) != 0)) + return NULL; + + spin_lock_bh(&geoip_lock); + + p->subnets = s; + p->ref = 1; + p->next = head; + p->prev = NULL; + if (p->next) p->next->prev = p; + head = p; + + spin_unlock_bh(&geoip_lock); + return p; +} + +static void remove_node(struct geoip_info *p) + { + spin_lock_bh(&geoip_lock); + + if (p->next) { /* Am I following a node ? */ + p->next->prev = p->prev; + if (p->prev) p->prev->next = p->next; /* Is there a node behind me ? */ + else head = p->next; /* No? Then I was the head */ + } + + else + if (p->prev) /* Is there a node behind me ? */ + p->prev->next = NULL; + else + head = NULL; /* No, we're alone */ + + /* So now am unlinked or the only one alive, right ? + * What are you waiting ? Free up some memory! 
+ */ + + kfree(p->subnets); + kfree(p); + + spin_unlock_bh(&geoip_lock); + return; +} + +static struct geoip_info *find_node(u_int16_t cc) +{ + struct geoip_info *p = head; + spin_lock_bh(&geoip_lock); + + while (p) { + if (p->cc == cc) { + spin_unlock_bh(&geoip_lock); + return p; + } + p = p->next; + } + spin_unlock_bh(&geoip_lock); + return NULL; +} + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_geoip_info *info = matchinfo; + const struct geoip_info *node; /* This keeps the code sexy */ + const struct iphdr *iph = skb->nh.iph; + u_int32_t ip, j; + u_int8_t i; + + if (info->flags & IPT_GEOIP_SRC) + ip = ntohl(iph->saddr); + else + ip = ntohl(iph->daddr); + + spin_lock_bh(&geoip_lock); + for (i = 0; i < info->count; i++) { + if ((node = info->mem[i]) == NULL) { + printk(KERN_ERR "ipt_geoip: what the hell ?? '%c%c' isn't loaded into memory... skip it!\n", + COUNTRY(info->cc[i])); + + continue; + } + + for (j = 0; j < node->count; j++) + if ((ip > node->subnets[j].begin) && (ip < node->subnets[j].end)) { + spin_unlock_bh(&geoip_lock); + return (info->flags & IPT_GEOIP_INV) ? 0 : 1; + } + } + + spin_unlock_bh(&geoip_lock); + return (info->flags & IPT_GEOIP_INV) ? 1 : 0; +} + +static int geoip_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_geoip_info *info = matchinfo; + struct geoip_info *node; + u_int8_t i; + + /* FIXME: Call a function to free userspace allocated memory. + * As Martin J. 
said; this match might eat lot of memory + * if commited with iptables-restore --noflush + void (*gfree)(struct geoip_info *oldmem); + gfree = info->fini; + */ + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_geoip_info))) { + printk(KERN_ERR "ipt_geoip: matchsize differ, you may have forgotten to recompile me\n"); + return 0; + } + + /* If info->refcount isn't NULL, then + * it means that checkentry() already + * initialized this entry. Increase a + * refcount to prevent destroy() of + * this entry. */ + if (info->refcount != NULL) { + atomic_inc((atomic_t *)info->refcount); + return 1; + } + + + for (i = 0; i < info->count; i++) { + + if ((node = find_node(info->cc[i])) != NULL) + atomic_inc((atomic_t *)&node->ref); //increase the reference + else + if ((node = add_node(info->mem[i])) == NULL) { + printk(KERN_ERR + "ipt_geoip: unable to load '%c%c' into memory\n", + COUNTRY(info->cc[i])); + return 0; + } + + /* Free userspace allocated memory for that country. + * FIXME: It's a bit odd to call this function everytime + * we process a country. Would be nice to call + * it once after all countries've been processed. + * - SJ + * *not implemented for now* + gfree(info->mem[i]); + */ + + /* Overwrite the now-useless pointer info->mem[i] with + * a pointer to the node's kernelspace structure. + * This avoids searching for a node in the match() and + * destroy() functions. + */ + info->mem[i] = node; + } + + /* We allocate some memory and give info->refcount a pointer + * to this memory. This prevents checkentry() from increasing a refcount + * different from the one used by destroy(). 
+ * For explanation, see http://www.mail-archive.com/netfilter-devel@lists.samba.org/msg00625.html + */ + info->refcount = kmalloc(sizeof(u_int8_t), GFP_KERNEL); + if (info->refcount == NULL) { + printk(KERN_ERR "ipt_geoip: failed to allocate `refcount' memory\n"); + return 0; + } + *(info->refcount) = 1; + + return 1; +} + +static void geoip_destroy(void *matchinfo, unsigned int matchsize) +{ + struct ipt_geoip_info *info = matchinfo; + struct geoip_info *node; /* this keeps the code sexy */ + u_int8_t i; + + /* Decrease the previously increased refcount in checkentry() + * If it's equal to 1, we know this entry is just moving + * but not removed. We simply return to avoid useless destroy() + * processing. + */ + atomic_dec((atomic_t *)info->refcount); + if (*info->refcount) + return; + + /* Don't leak my memory, you idiot. + * Bug found with nfsim.. the netfilter's best + * friend. --peejix */ + kfree(info->refcount); + + /* This entry has been removed from the table so + * decrease the refcount of all countries it is + * using. + */ + + for (i = 0; i < info->count; i++) + if ((node = info->mem[i]) != NULL) { + atomic_dec((atomic_t *)&node->ref); + + /* Free up some memory if that node isn't used + * anymore. */ + if (node->ref < 1) + remove_node(node); + } + else + /* Something strange happened. There's no memory allocated for this + * country. Please send this bug to the mailing list. */ + printk(KERN_ERR + "ipt_geoip: What happened peejix ? 
What happened acidmen ?\n" + "ipt_geoip: please report this bug to the maintainers\n"); + return; +} + +static struct ipt_match geoip_match += { { NULL, NULL }, "geoip", &match, &geoip_checkentry, &geoip_destroy, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&geoip_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&geoip_match); + return; +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_ipp2p.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_ipp2p.c new file mode 100644 index 00000000..c36b2005 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_ipp2p.c @@ -0,0 +1,868 @@ +#if defined(MODVERSIONS) +#include +#endif +#include +#include +#include +#include +#include +#include + +#define get_u8(X,O) (*(__u8 *)(X + O)) +#define get_u16(X,O) (*(__u16 *)(X + O)) +#define get_u32(X,O) (*(__u32 *)(X + O)) + +MODULE_AUTHOR("Eicke Friedrich/Klaus Degner "); +MODULE_DESCRIPTION("An extension to iptables to identify P2P traffic."); +MODULE_LICENSE("GPL"); + + +/*Search for UDP eDonkey/eMule/Kad commands*/ +int +udp_search_edk (unsigned char *haystack, int packet_len) +{ + unsigned char *t = haystack; + t += 8; + + switch (t[0]) { + case 0xe3: + { /*edonkey*/ + switch (t[1]) + { + /* client -> server status request */ + case 0x96: + if (packet_len == 14) return ((IPP2P_EDK * 100) + 50); + break; + /* server -> client status request */ + case 0x97: if (packet_len == 42) return ((IPP2P_EDK * 100) + 51); + break; + /* server description request */ + /* e3 2a ff f0 .. | size == 6 */ + case 0xa2: if ( (packet_len == 14) && ( get_u16(t,2) == __constant_htons(0xfff0) ) ) return ((IPP2P_EDK * 100) + 52); + break; + /* server description response */ + /* e3 a3 ff f0 .. 
| size > 40 && size < 200 */ + //case 0xa3: return ((IPP2P_EDK * 100) + 53); + // break; + case 0x9a: if (packet_len==26) return ((IPP2P_EDK * 100) + 54); + break; + + case 0x92: if (packet_len==18) return ((IPP2P_EDK * 100) + 55); + break; + } + break; + } + case 0xe4: + { + switch (t[1]) + { + /* e4 20 .. | size == 43 */ + case 0x20: if ((packet_len == 43) && (t[2] != 0x00) && (t[34] != 0x00)) return ((IPP2P_EDK * 100) + 60); + break; + /* e4 00 .. 00 | size == 35 ? */ + case 0x00: if ((packet_len == 35) && (t[26] == 0x00)) return ((IPP2P_EDK * 100) + 61); + break; + /* e4 10 .. 00 | size == 35 ? */ + case 0x10: if ((packet_len == 35) && (t[26] == 0x00)) return ((IPP2P_EDK * 100) + 62); + break; + /* e4 18 .. 00 | size == 35 ? */ + case 0x18: if ((packet_len == 35) && (t[26] == 0x00)) return ((IPP2P_EDK * 100) + 63); + break; + /* e4 52 .. | size = 44 */ + case 0x52: if (packet_len == 44 ) return ((IPP2P_EDK * 100) + 64); + break; + /* e4 58 .. | size == 6 */ + case 0x58: if (packet_len == 14 ) return ((IPP2P_EDK * 100) + 65); + break; + /* e4 59 .. | size == 2 */ + case 0x59: if (packet_len == 10 )return ((IPP2P_EDK * 100) + 66); + break; + /* e4 28 .. | packet_len == 52,77,102,127... 
*/ + case 0x28: if (((packet_len-52) % 25) == 0) return ((IPP2P_EDK * 100) + 67); + break; + /* e4 50 xx xx | size == 4 */ + case 0x50: if (packet_len == 12) return ((IPP2P_EDK * 100) + 68); + break; + /* e4 40 xx xx | size == 48 */ + case 0x40: if (packet_len == 56) return ((IPP2P_EDK * 100) + 69); + break; + } + break; + } + } /* end of switch (t[0]) */ + return 0; +}/*udp_search_edk*/ + + +/*Search for UDP Gnutella commands*/ +int +udp_search_gnu (unsigned char *haystack, int packet_len) +{ + unsigned char *t = haystack; + t += 8; + + if (memcmp(t, "GND", 3) == 0) return ((IPP2P_GNU * 100) + 51); + if (memcmp(t, "GNUTELLA ", 9) == 0) return ((IPP2P_GNU * 100) + 52); + return 0; +}/*udp_search_gnu*/ + + +/*Search for UDP KaZaA commands*/ +int +udp_search_kazaa (unsigned char *haystack, int packet_len) +{ + unsigned char *t = haystack; + + if (t[packet_len-1] == 0x00){ + t += (packet_len - 6); + if (memcmp(t, "KaZaA", 5) == 0) return (IPP2P_KAZAA * 100 +50); + } + + return 0; +}/*udp_search_kazaa*/ + +/*Search for UDP DirectConnect commands*/ +int +udp_search_directconnect (unsigned char *haystack, int packet_len) +{ + unsigned char *t = haystack; + if ((*(t + 8) == 0x24) && (*(t + packet_len - 1) == 0x7c)) { + t+=8; + if (memcmp(t, "SR ", 3) == 0) return ((IPP2P_DC * 100) + 60); + if (memcmp(t, "Ping ", 5) == 0) return ((IPP2P_DC * 100) + 61); + } + return 0; +}/*udp_search_directconnect*/ + + + +/*Search for UDP BitTorrent commands*/ +int +udp_search_bit (unsigned char *haystack, int packet_len) +{ + switch(packet_len) + { + case 24: + /* ^ 00 00 04 17 27 10 19 80 */ + if ((ntohl(get_u32(haystack, 8)) == 0x00000417) && (ntohl(get_u32(haystack, 12)) == 0x27101980)) + return (IPP2P_BIT * 100 + 50); + break; + case 44: + if (get_u32(haystack, 16) == __constant_htonl(0x00000400) && get_u32(haystack, 36) == __constant_htonl(0x00000104)) + return (IPP2P_BIT * 100 + 51); + if (get_u32(haystack, 16) == __constant_htonl(0x00000400)) + return (IPP2P_BIT * 100 + 61); + 
break; + case 65: + if (get_u32(haystack, 16) == __constant_htonl(0x00000404) && get_u32(haystack, 36) == __constant_htonl(0x00000104)) + return (IPP2P_BIT * 100 + 52); + if (get_u32(haystack, 16) == __constant_htonl(0x00000404)) + return (IPP2P_BIT * 100 + 62); + break; + case 67: + if (get_u32(haystack, 16) == __constant_htonl(0x00000406) && get_u32(haystack, 36) == __constant_htonl(0x00000104)) + return (IPP2P_BIT * 100 + 53); + if (get_u32(haystack, 16) == __constant_htonl(0x00000406)) + return (IPP2P_BIT * 100 + 63); + break; + case 211: + if (get_u32(haystack, 8) == __constant_htonl(0x00000405)) + return (IPP2P_BIT * 100 + 54); + break; + case 29: + if ((get_u32(haystack, 8) == __constant_htonl(0x00000401))) + return (IPP2P_BIT * 100 + 55); + break; + case 52: + if (get_u32(haystack,8) == __constant_htonl(0x00000827) && + get_u32(haystack,12) == __constant_htonl(0x37502950)) + return (IPP2P_BIT * 100 + 80); + break; + default: + /* this packet does not have a constant size */ + if (packet_len >= 40 && get_u32(haystack, 16) == __constant_htonl(0x00000402) && get_u32(haystack, 36) == __constant_htonl(0x00000104)) + return (IPP2P_BIT * 100 + 56); + break; + } + + /* some extra-bitcomet rules: + * "d1:" [a|r] "d2:id20:" + */ + if (packet_len > 30 && get_u8(haystack, 8) == 'd' && get_u8(haystack, 9) == '1' && get_u8(haystack, 10) == ':' ) + { + if (get_u8(haystack, 11) == 'a' || get_u8(haystack, 11) == 'r') + { + if (memcmp(haystack+12,"d2:id20:",8)==0) + return (IPP2P_BIT * 100 + 57); + } + } + +#if 0 + /* bitlord rules */ + /* packetlen must be bigger than 40 */ + /* first 4 bytes are zero */ + if (packet_len > 40 && get_u32(haystack, 8) == 0x00000000) + { + /* first rule: 00 00 00 00 01 00 00 xx xx xx xx 00 00 00 00*/ + if (get_u32(haystack, 12) == 0x00000000 && + get_u32(haystack, 16) == 0x00010000 && + get_u32(haystack, 24) == 0x00000000 ) + return (IPP2P_BIT * 100 + 71); + + /* 00 01 00 00 0d 00 00 xx xx xx xx 00 00 00 00*/ + if (get_u32(haystack, 12) == 
0x00000001 && + get_u32(haystack, 16) == 0x000d0000 && + get_u32(haystack, 24) == 0x00000000 ) + return (IPP2P_BIT * 100 + 71); + + + } +#endif + + return 0; +}/*udp_search_bit*/ + + + +/*Search for Ares commands*/ +//#define IPP2P_DEBUG_ARES +int +search_ares (const unsigned char *payload, const u16 plen) +//int search_ares (unsigned char *haystack, int packet_len, int head_len) +{ +// const unsigned char *t = haystack + head_len; + + /* all ares packets start with */ + if (payload[1] == 0 && (plen - payload[0]) == 3) + { + switch (payload[2]) + { + case 0x5a: + /* ares connect */ + if ( plen == 6 && payload[5] == 0x05 ) return ((IPP2P_ARES * 100) + 1); + break; + case 0x09: + /* ares search, min 3 chars --> 14 bytes + * lets define a search can be up to 30 chars --> max 34 bytes + */ + if ( plen >= 14 && plen <= 34 ) return ((IPP2P_ARES * 100) + 1); + break; +#ifdef IPP2P_DEBUG_ARES + default: + printk(KERN_DEBUG "Unknown Ares command %x recognized, len: %u \n", (unsigned int) payload[2],plen); +#endif /* IPP2P_DEBUG_ARES */ + } + } + +#if 0 + /* found connect packet: 03 00 5a 04 03 05 */ + /* new version ares 1.8: 03 00 5a xx xx 05 */ + if ((plen) == 6){ /* possible connect command*/ + if ((payload[0] == 0x03) && (payload[1] == 0x00) && (payload[2] == 0x5a) && (payload[5] == 0x05)) + return ((IPP2P_ARES * 100) + 1); + } + if ((plen) == 60){ /* possible download command*/ + if ((payload[59] == 0x0a) && (payload[58] == 0x0a)){ + if (memcmp(t, "PUSH SHA1:", 10) == 0) /* found download command */ + return ((IPP2P_ARES * 100) + 2); + } + } +#endif + + return 0; +} /*search_ares*/ + +/*Search for SoulSeek commands*/ +int +search_soul (const unsigned char *payload, const u16 plen) +{ +//#define IPP2P_DEBUG_SOUL + /* match: xx xx xx xx | xx = sizeof(payload) - 4 */ + if (get_u32(payload, 0) == (plen - 4)){ + const __u32 m=get_u32(payload, 4); + /* match 00 yy yy 00, yy can be everything */ + if ( get_u8(payload, 4) == 0x00 && get_u8(payload, 7) == 0x00 ) + { +#ifdef 
IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "0: Soulseek command 0x%x recognized\n",get_u32(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 1); + } + + /* next match: 01 yy 00 00 | yy can be everything */ + if ( get_u8(payload, 4) == 0x01 && get_u16(payload, 6) == 0x0000 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "1: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 2); + } + + /* other soulseek commandos are: 1-5,7,9,13-18,22,23,26,28,35-37,40-46,50,51,60,62-69,91,92,1001 */ + /* try to do this in an intelligent way */ + /* get all small commandos */ + switch(m) + { + case 7: + case 9: + case 22: + case 23: + case 26: + case 28: + case 50: + case 51: + case 60: + case 91: + case 92: + case 1001: +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "2: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 3); + } + + if (m > 0 && m < 6 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "3: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 4); + } + if (m > 12 && m < 19 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "4: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 5); + } + + if (m > 34 && m < 38 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "5: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 6); + } + + if (m > 39 && m < 47 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "6: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 7); + } + + if (m > 61 && m < 70 ) + { +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "7: Soulseek command 0x%x recognized\n",get_u16(payload, 4)); +#endif /* IPP2P_DEBUG_SOUL */ + return 
((IPP2P_SOUL * 100) + 8); + } + +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "unknown SOULSEEK command: 0x%x, first 16 bit: 0x%x, first 8 bit: 0x%x ,soulseek ???\n",get_u32(payload, 4),get_u16(payload, 4) >> 16,get_u8(payload, 4) >> 24); +#endif /* IPP2P_DEBUG_SOUL */ + } + + /* match 14 00 00 00 01 yy 00 00 00 STRING(YY) 01 00 00 00 00 46|50 00 00 00 00 */ + /* without size at the beginning !!! */ + if ( get_u32(payload, 0) == 0x14 && get_u8(payload, 4) == 0x01 ) + { + __u32 y=get_u32(payload, 5); + /* we need 19 chars + string */ + if ( (y + 19) <= (plen) ) + { + const unsigned char *w=payload+9+y; + if (get_u32(w, 0) == 0x01 && ( get_u16(w, 4) == 0x4600 || get_u16(w, 4) == 0x5000) && get_u32(w, 6) == 0x00); +#ifdef IPP2P_DEBUG_SOUL + printk(KERN_DEBUG "Soulssek special client command recognized\n"); +#endif /* IPP2P_DEBUG_SOUL */ + return ((IPP2P_SOUL * 100) + 9); + } + } + return 0; +} + + +/*Search for WinMX commands*/ +int +search_winmx (const unsigned char *payload, const u16 plen) +{ +//#define IPP2P_DEBUG_WINMX + if (((plen) == 4) && (memcmp(payload, "SEND", 4) == 0)) return ((IPP2P_WINMX * 100) + 1); + if (((plen) == 3) && (memcmp(payload, "GET", 3) == 0)) return ((IPP2P_WINMX * 100) + 2); + //if (packet_len < (head_len + 10)) return 0; + if (plen < 10) return 0; + + if ((memcmp(payload, "SEND", 4) == 0) || (memcmp(payload, "GET", 3) == 0)){ + u16 c=4; + const u16 end=plen-2; + u8 count=0; + while (c < end) + { + if (payload[c]== 0x20 && payload[c+1] == 0x22) + { + c++; + count++; + if (count>=2) return ((IPP2P_WINMX * 100) + 3); + } + c++; + } + } + + if ( plen == 149 && payload[0] == '8' ) + { +#ifdef IPP2P_DEBUG_WINMX + printk(KERN_INFO "maybe WinMX\n"); +#endif + if (get_u32(payload,17) == 0 && get_u32(payload,21) == 0 && get_u32(payload,25) == 0 && +// get_u32(payload,33) == __constant_htonl(0x71182b1a) && get_u32(payload,37) == __constant_htonl(0x05050000) && +// get_u32(payload,133) == __constant_htonl(0x31097edf) && get_u32(payload,145) == 
__constant_htonl(0xdcb8f792)) + get_u16(payload,39) == 0 && get_u16(payload,135) == __constant_htons(0x7edf) && get_u16(payload,147) == __constant_htons(0xf792)) + + { +#ifdef IPP2P_DEBUG_WINMX + printk(KERN_INFO "got WinMX\n"); +#endif + return ((IPP2P_WINMX * 100) + 4); + } + } + return 0; +} /*search_winmx*/ + + +/*Search for appleJuice commands*/ +int +search_apple (const unsigned char *payload, const u16 plen) +{ + if ( (plen > 7) && (payload[6] == 0x0d) && (payload[7] == 0x0a) && (memcmp(payload, "ajprot", 6) == 0)) return (IPP2P_APPLE * 100); + + return 0; +} + + +/*Search for BitTorrent commands*/ +int +search_bittorrent (const unsigned char *payload, const u16 plen) +{ + if (plen > 20) + { + /* test for match 0x13+"BitTorrent protocol" */ + if (payload[0] == 0x13) + { + if (memcmp(payload+1, "BitTorrent protocol", 19) == 0) return (IPP2P_BIT * 100); + } + + /* get tracker commandos, all starts with GET / + * then it can follow: scrape| announce + * and then ?hash_info= + */ + if (memcmp(payload,"GET /",5) == 0) + { + /* message scrape */ + if ( memcmp(payload+5,"scrape?info_hash=",17)==0 ) return (IPP2P_BIT * 100 + 1); + /* message announce */ + if ( memcmp(payload+5,"announce?info_hash=",19)==0 ) return (IPP2P_BIT * 100 + 2); + } + } + else + { + /* bitcomet encryptes the first packet, so we have to detect another + * one later in the flow */ + /* first try failed, too many missdetections */ + //if ( size == 5 && get_u32(t,0) == __constant_htonl(1) && t[4] < 3) return (IPP2P_BIT * 100 + 3); + + /* second try: block request packets */ + if ( plen == 17 && get_u32(payload,0) == __constant_htonl(0x0d) && payload[4] == 0x06 && get_u32(payload,13) == __constant_htonl(0x4000) ) return (IPP2P_BIT * 100 + 3); + } + + return 0; +} + + + +/*check for Kazaa get command*/ +int +search_kazaa (const unsigned char *payload, const u16 plen) + +{ + if ((payload[plen-2] == 0x0d) && (payload[plen-1] == 0x0a) && memcmp(payload, "GET /.hash=", 11) == 0) + return 
(IPP2P_DATA_KAZAA * 100); + + return 0; +} + + +/*check for gnutella get command*/ +int +search_gnu (const unsigned char *payload, const u16 plen) +{ + if ((payload[plen-2] == 0x0d) && (payload[plen-1] == 0x0a)) + { + if (memcmp(payload, "GET /get/", 9) == 0) return ((IPP2P_DATA_GNU * 100) + 1); + if (memcmp(payload, "GET /uri-res/", 13) == 0) return ((IPP2P_DATA_GNU * 100) + 2); + } + return 0; +} + + +/*check for gnutella get commands and other typical data*/ +int +search_all_gnu (const unsigned char *payload, const u16 plen) +{ + + if ((payload[plen-2] == 0x0d) && (payload[plen-1] == 0x0a)) + { + + if (memcmp(payload, "GNUTELLA CONNECT/", 17) == 0) return ((IPP2P_GNU * 100) + 1); + if (memcmp(payload, "GNUTELLA/", 9) == 0) return ((IPP2P_GNU * 100) + 2); + + + if ((memcmp(payload, "GET /get/", 9) == 0) || (memcmp(payload, "GET /uri-res/", 13) == 0)) + { + u16 c=8; + const u16 end=(plen > 22) ? plen-22 : 0; /* plain plen-22 wraps for short packets and sends the scan past the payload */ + while (c < end) { + if ( payload[c] == 0x0a && payload[c+1] == 0x0d && ((memcmp(&payload[c+2], "X-Gnutella-", 11) == 0) || (memcmp(&payload[c+2], "X-Queue:", 8) == 0))) + return ((IPP2P_GNU * 100) + 3); + c++; + } + } + } + return 0; +} + + +/*check for KaZaA download commands and other typical data*/ +int +search_all_kazaa (const unsigned char *payload, const u16 plen) +{ + if ((payload[plen-2] == 0x0d) && (payload[plen-1] == 0x0a)) + { + + if (memcmp(payload, "GIVE ", 5) == 0) return ((IPP2P_KAZAA * 100) + 1); + + if (memcmp(payload, "GET /", 5) == 0) { + u16 c = 8; + const u16 end=(plen > 22) ? plen-22 : 0; /* plain plen-22 wraps for short packets and sends the scan past the payload */ + while (c < end) { + if ( payload[c] == 0x0a && payload[c+1] == 0x0d && ((memcmp(&payload[c+2], "X-Kazaa-Username: ", 18) == 0) || (memcmp(&payload[c+2], "User-Agent: PeerEnabler/", 24) == 0))) + return ((IPP2P_KAZAA * 100) + 2); + c++; + } + } + } + return 0; +} + +/*fast check for edonkey file segment transfer command*/ +int +search_edk (const unsigned char *payload, const u16 plen) +{ + if (payload[0] != 0xe3) + return 0; + else { + if (payload[5] == 0x47) + return (IPP2P_DATA_EDK *
100); + else + return 0; + } +} + + + +/*intensive but slower search for some edonkey packets including size-check*/ +int +search_all_edk (const unsigned char *payload, const u16 plen) +{ + if (payload[0] != 0xe3) + return 0; + else { + //t += head_len; + const u16 cmd = get_u16(payload, 1); + if (cmd == (plen - 5)) { + switch (payload[5]) { + case 0x01: return ((IPP2P_EDK * 100) + 1); /*Client: hello or Server:hello*/ + case 0x4c: return ((IPP2P_EDK * 100) + 9); /*Client: Hello-Answer*/ + } + } + return 0; + } +} + + +/*fast check for Direct Connect send command*/ +int +search_dc (const unsigned char *payload, const u16 plen) +{ + + if (payload[0] != 0x24 ) + return 0; + else { + if (memcmp(&payload[1], "Send|", 5) == 0) + return (IPP2P_DATA_DC * 100); + else + return 0; + } + +} + + +/*intensive but slower check for all direct connect packets*/ +int +search_all_dc (const unsigned char *payload, const u16 plen) +{ +// unsigned char *t = haystack; + + if (payload[0] == 0x24 && payload[plen-1] == 0x7c) + { + const unsigned char *t=&payload[1]; + /* Client-Hub-Protocol */ + if (memcmp(t, "Lock ", 5) == 0) return ((IPP2P_DC * 100) + 1); + /* Client-Client-Protocol, some are already recognized by client-hub (like lock) */ + if (memcmp(t, "MyNick ", 7) == 0) return ((IPP2P_DC * 100) + 38); + } + return 0; +} + +/*check for mute*/ +int +search_mute (const unsigned char *payload, const u16 plen) +{ + if ( plen == 209 || plen == 345 || plen == 473 || plen == 609 || plen == 1121 ) + { + //printk(KERN_DEBUG "size hit: %u",size); + if (memcmp(payload,"PublicKey: ",11) == 0 ) + { + return ((IPP2P_MUTE * 100) + 0); + +/* if (memcmp(t+size-14,"\x0aEndPublicKey\x0a",14) == 0) + { + printk(KERN_DEBUG "end pubic key hit: %u",size); + + }*/ + } + } + return 0; +} + + +/* check for xdcc */ +int +search_xdcc (const unsigned char *payload, const u16 plen) +{ + /* search in small packets only */ + if (plen > 20 && plen < 200 && payload[plen-1] == 0x0a && payload[plen-2] == 0x0d && 
memcmp(payload,"PRIVMSG ",8) == 0) + { + + u16 x=10; + const u16 end=plen - 13; + + /* it seems to be an IRC private message, check for xdcc command */ + while (x < end) + { + if (payload[x] == ':') + { + if ( memcmp(&payload[x+1],"xdcc send #",11) == 0 ) + return ((IPP2P_XDCC * 100) + 0); + } + x++; + } + } + return 0; +} + +/* search for waste */ +int search_waste(const unsigned char *payload, const u16 plen) +{ + if ( plen >= 9 && memcmp(payload,"GET.sha1:",9) == 0) /* the signature is 9 bytes long, so 9 bytes must be present */ + return ((IPP2P_WASTE * 100) + 0); + + return 0; +} + + +static struct { + int command; + __u8 short_hand; /*for functions included in short hands*/ + int packet_len; + int (*function_name) (const unsigned char *, const u16); +} matchlist[] = { + {IPP2P_EDK,SHORT_HAND_IPP2P,20, &search_all_edk}, +// {IPP2P_DATA_KAZAA,SHORT_HAND_DATA,200, &search_kazaa}, +// {IPP2P_DATA_EDK,SHORT_HAND_DATA,60, &search_edk}, +// {IPP2P_DATA_DC,SHORT_HAND_DATA,26, &search_dc}, + {IPP2P_DC,SHORT_HAND_IPP2P,5, search_all_dc}, +// {IPP2P_DATA_GNU,SHORT_HAND_DATA,40, &search_gnu}, + {IPP2P_GNU,SHORT_HAND_IPP2P,5, &search_all_gnu}, + {IPP2P_KAZAA,SHORT_HAND_IPP2P,5, &search_all_kazaa}, + {IPP2P_BIT,SHORT_HAND_IPP2P,20, &search_bittorrent}, + {IPP2P_APPLE,SHORT_HAND_IPP2P,5, &search_apple}, + {IPP2P_SOUL,SHORT_HAND_IPP2P,5, &search_soul}, + {IPP2P_WINMX,SHORT_HAND_IPP2P,2, &search_winmx}, + {IPP2P_ARES,SHORT_HAND_IPP2P,5, &search_ares}, + {IPP2P_MUTE,SHORT_HAND_NONE,200, &search_mute}, + {IPP2P_WASTE,SHORT_HAND_NONE,5, &search_waste}, + {IPP2P_XDCC,SHORT_HAND_NONE,5, &search_xdcc}, + {0,0,0,NULL} +}; + + +static struct { + int command; + __u8 short_hand; /*for functions included in short hands*/ + int packet_len; + int (*function_name) (unsigned char *, int); +} udp_list[] = { + {IPP2P_KAZAA,SHORT_HAND_IPP2P,14, &udp_search_kazaa}, + {IPP2P_BIT,SHORT_HAND_IPP2P,23, &udp_search_bit}, + {IPP2P_GNU,SHORT_HAND_IPP2P,11, &udp_search_gnu}, + {IPP2P_EDK,SHORT_HAND_IPP2P,9, &udp_search_edk}, + {IPP2P_DC,SHORT_HAND_IPP2P,12,
&udp_search_directconnect}, + {0,0,0,NULL} +}; + + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + const void *hdr, + u_int16_t datalen, +#endif + + int *hotdrop) +{ + const struct ipt_p2p_info *info = matchinfo; + unsigned char *haystack; + struct iphdr *ip = skb->nh.iph; + int p2p_result = 0, i = 0; +// int head_len; + int hlen = ntohs(ip->tot_len)-(ip->ihl*4); /*hlen = packet-data length*/ + + /*must not be a fragment*/ + if (offset) { + if (info->debug) printk("IPP2P.match: offset found %i \n",offset); + return 0; + } + + /*make sure that skb is linear*/ + if(skb_is_nonlinear(skb)){ + if (info->debug) printk("IPP2P.match: nonlinear skb found\n"); + return 0; + } + + + haystack=(char *)ip+(ip->ihl*4); /*haystack = packet data*/ + + switch (ip->protocol){ + case IPPROTO_TCP: /*what to do with a TCP packet*/ + { + struct tcphdr *tcph = (void *) ip + ip->ihl * 4; + + if (tcph->fin) return 0; /*if FIN bit is set bail out*/ + if (tcph->syn) return 0; /*if SYN bit is set bail out*/ + if (tcph->rst) return 0; /*if RST bit is set bail out*/ + + haystack += tcph->doff * 4; /*get TCP-Header-Size*/ + hlen -= tcph->doff * 4; + while (matchlist[i].command) { + if ((((info->cmd & matchlist[i].command) == matchlist[i].command) || + ((info->cmd & matchlist[i].short_hand) == matchlist[i].short_hand)) && + (hlen > matchlist[i].packet_len)) { + p2p_result = matchlist[i].function_name(haystack, hlen); + if (p2p_result) + { + if (info->debug) printk("IPP2P.debug:TCP-match: %i from: %u.%u.%u.%u:%i to: %u.%u.%u.%u:%i Length: %i\n", + p2p_result, NIPQUAD(ip->saddr),ntohs(tcph->source), NIPQUAD(ip->daddr),ntohs(tcph->dest),hlen); + return p2p_result; + } + } + i++; + } + return p2p_result; + } + + case IPPROTO_UDP: /*what to do with an UDP packet*/ + { + struct udphdr *udph = (void *) ip + ip->ihl * 4; + + while (udp_list[i].command){ + 
if ((((info->cmd & udp_list[i].command) == udp_list[i].command) || + ((info->cmd & udp_list[i].short_hand) == udp_list[i].short_hand)) && + (hlen > udp_list[i].packet_len)) { + p2p_result = udp_list[i].function_name(haystack, hlen); + if (p2p_result){ + if (info->debug) printk("IPP2P.debug:UDP-match: %i from: %u.%u.%u.%u:%i to: %u.%u.%u.%u:%i Length: %i\n", + p2p_result, NIPQUAD(ip->saddr),ntohs(udph->source), NIPQUAD(ip->daddr),ntohs(udph->dest),hlen); + return p2p_result; + } + } + i++; + } + return p2p_result; + } + + default: return 0; + } +} + + + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* Must specify -p tcp */ +/* if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) { + * printk("ipp2p: Only works on TCP packets, use -p tcp\n"); + * return 0; + * }*/ + return 1; +} + + + + +static struct ipt_match ipp2p_match = { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) + { NULL, NULL }, + "ipp2p", + &match, + &checkentry, + NULL, + THIS_MODULE +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .name = "ipp2p", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +#endif +}; + + +static int __init init(void) +{ + printk(KERN_INFO "IPP2P v%s loading\n", IPP2P_VERSION); + return ipt_register_match(&ipp2p_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ipp2p_match); + printk(KERN_INFO "IPP2P v%s unloaded\n", IPP2P_VERSION); +} + +module_init(init); +module_exit(fini); + + diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_iprange.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_iprange.c new file mode 100644 index 00000000..38902524 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_iprange.c @@ -0,0 +1,101 @@ +/* + * iptables module to match IP address ranges + * (c) 2003 Jozsef Kadlecsik + * + * Released under the terms of GNU GPLv2. 
+ * + */ +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("iptables arbitrary IP range match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_iprange_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + + if (info->flags & IPRANGE_SRC) { + if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) + || (ntohl(iph->saddr) > ntohl(info->src.max_ip))) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + DEBUGP("src IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + NIPQUAD(info->src.min_ip), + NIPQUAD(info->src.max_ip)); + return 0; + } + } + if (info->flags & IPRANGE_DST) { + if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip)) + || (ntohl(iph->daddr) > ntohl(info->dst.max_ip))) + ^ !!(info->flags & IPRANGE_DST_INV)) { + DEBUGP("dst IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->daddr), + info->flags & IPRANGE_DST_INV ? 
"(INV) " : "", + NIPQUAD(info->dst.min_ip), + NIPQUAD(info->dst.max_ip)); + return 0; + } + } + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info))) + return 0; + + return 1; +} + +static struct ipt_match iprange_match = +{ + .list = { NULL, NULL }, + .name = "iprange", + .match = &match, + .checkentry = &check, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&iprange_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&iprange_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_layer7.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_layer7.c new file mode 100644 index 00000000..567e3847 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_layer7.c @@ -0,0 +1,570 @@ +/* + Kernel module to match application layer (OSI layer 7) + data in connections. + + http://l7-filter.sf.net + + By Matthew Strait and Ethan Sommer, 2003-2005. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version + 2 of the License, or (at your option) any later version. 
+ http://www.gnu.org/licenses/gpl.txt + + Based on ipt_string.c (C) 2000 Emmanuel Roger + and cls_layer7.c (C) 2003 Matthew Strait, Ethan Sommer, Justin Levandoski +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "regexp/regexp.c" + +#include +#include + +MODULE_AUTHOR("Matthew Strait , Ethan Sommer "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables application layer match module"); + +static int maxdatalen = 2048; // this is the default +MODULE_PARM(maxdatalen,"i"); +MODULE_PARM_DESC(maxdatalen,"maximum bytes of data looked at by l7-filter"); + +#if defined(CONFIG_IP_NF_MATCH_LAYER7_DEBUG) + #define DPRINTK(format,args...) printk(format,##args) +#else + #define DPRINTK(format,args...) +#endif + +#define TOTAL_PACKETS master_conntrack->layer7.numpackets + +/* Number of packets whose data we look at. +This can be modified through /proc/net/layer7_numpackets */ +static int num_packets = 10; + +static struct pattern_cache { + char * regex_string; + regexp * pattern; + struct pattern_cache * next; +} * first_pattern_cache = NULL; + +/* I'm new to locking. Here are my assumptions: + +- No one will write to /proc/net/layer7_numpackets over and over very fast; + if they did, nothing awful would happen. + +- This code will never be processing the same packet twice at the same time, + because iptables rules are traversed in order. + +- It doesn't matter if two packets from different connections are in here at + the same time, because they don't share any data. + +- It _does_ matter if two packets from the same connection are here at the same + time. In this case, we have to protect the conntracks and the list of + compiled patterns. +*/ +DECLARE_RWLOCK(ct_lock); +DECLARE_LOCK(list_lock); + +#if CONFIG_IP_NF_MATCH_LAYER7_DEBUG +/* Converts an unfriendly string into a friendly one by +replacing unprintables with periods and all whitespace with " ". 
*/ +static char * friendly_print(unsigned char * s) +{ + char * f = kmalloc(strlen(s) + 1, GFP_ATOMIC); + int i; + + if(!f) { + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in friendly_print, bailing.\n"); + return NULL; + } + + for(i = 0; i < strlen(s); i++){ + if(isprint(s[i]) && s[i] < 128) f[i] = s[i]; + else if(isspace(s[i])) f[i] = ' '; + else f[i] = '.'; + } + f[i] = '\0'; + return f; +} + +static char dec2hex(int i) +{ + switch (i) { + case 0 ... 9: + return (char)(i + '0'); + break; + case 10 ... 15: + return (char)(i - 10 + 'a'); + break; + default: + if (net_ratelimit()) + printk("Problem in dec2hex\n"); + return '\0'; + } +} + +static char * hex_print(unsigned char * s) +{ + char * g = kmalloc(strlen(s)*3 + 1, GFP_ATOMIC); + int i; + + if(!g) { + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in hex_print, bailing.\n"); + return NULL; + } + + for(i = 0; i < strlen(s); i++) { + g[i*3 ] = dec2hex(s[i]/16); + g[i*3 + 1] = dec2hex(s[i]%16); + g[i*3 + 2] = ' '; + } + g[i*3] = '\0'; + + return g; +} +#endif // DEBUG + +/* Use instead of regcomp. As we expect to be seeing the same regexps over and +over again, it make sense to cache the results. */ +static regexp * compile_and_cache(char * regex_string, char * protocol) +{ + struct pattern_cache * node = first_pattern_cache; + struct pattern_cache * last_pattern_cache = first_pattern_cache; + struct pattern_cache * tmp; + unsigned int len; + + while (node != NULL) { + if (!strcmp(node->regex_string, regex_string)) + return node->pattern; + + last_pattern_cache = node;/* points at the last non-NULL node */ + node = node->next; + } + + /* If we reach the end of the list, then we have not yet cached + the pattern for this regex. Let's do that now. + Be paranoid about running out of memory to avoid list corruption. 
*/ + tmp = kmalloc(sizeof(struct pattern_cache), GFP_ATOMIC); + + if(!tmp) { + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in compile_and_cache, bailing.\n"); + return NULL; + } + + tmp->regex_string = kmalloc(strlen(regex_string) + 1, GFP_ATOMIC); + tmp->pattern = kmalloc(sizeof(struct regexp), GFP_ATOMIC); + tmp->next = NULL; + + if(!tmp->regex_string || !tmp->pattern) { + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in compile_and_cache, bailing.\n"); + kfree(tmp->regex_string); + kfree(tmp->pattern); + kfree(tmp); + return NULL; + } + + /* Ok. The new node is all ready now. */ + node = tmp; + + if(first_pattern_cache == NULL) /* list is empty */ + first_pattern_cache = node; /* make node the beginning */ + else + last_pattern_cache->next = node; /* attach node to the end */ + + /* copy the string and compile the regex */ + len = strlen(regex_string); + DPRINTK("About to compile this: \"%s\"\n", regex_string); + node->pattern = regcomp(regex_string, &len); + if ( !node->pattern ) { + if (net_ratelimit()) + printk(KERN_ERR "layer7: Error compiling regexp \"%s\" (%s)\n", regex_string, protocol); + /* pattern is now cached as NULL, so we won't try again. */ + } + + strcpy(node->regex_string, regex_string); + return node->pattern; +} + +static int can_handle(const struct sk_buff *skb) +{ + if(!skb->nh.iph) /* not IP */ + return 0; + if(skb->nh.iph->protocol != IPPROTO_TCP && + skb->nh.iph->protocol != IPPROTO_UDP && + skb->nh.iph->protocol != IPPROTO_ICMP) + return 0; + return 1; +} + +/* Returns offset the into the skb->data that the application data starts */ +static int app_data_offset(const struct sk_buff *skb) +{ + /* In case we are ported somewhere (ebtables?) where skb->nh.iph + isn't set, this can be gotten from 4*(skb->data[0] & 0x0f) as well. */ + int ip_hl = 4*skb->nh.iph->ihl; + + if( skb->nh.iph->protocol == IPPROTO_TCP ) { + /* 12 == offset into TCP header for the header length field. 
+ Can't get this with skb->h.th->doff because the tcphdr + struct doesn't get set when routing (this is confirmed to be + true in Netfilter as well as QoS.) */ + int tcp_hl = 4*(skb->data[ip_hl + 12] >> 4); + + return ip_hl + tcp_hl; + } else if( skb->nh.iph->protocol == IPPROTO_UDP ) { + return ip_hl + 8; /* UDP header is always 8 bytes */ + } else if( skb->nh.iph->protocol == IPPROTO_ICMP ) { + return ip_hl + 8; /* ICMP header is 8 bytes */ + } else { + if (net_ratelimit()) + printk(KERN_ERR "layer7: tried to handle unknown protocol!\n"); + return ip_hl + 8; /* something reasonable */ + } +} + +/* handles whether there's a match when we aren't appending data anymore */ +static int match_no_append(struct ip_conntrack * conntrack, struct ip_conntrack * master_conntrack, + enum ip_conntrack_info ctinfo, enum ip_conntrack_info master_ctinfo, + struct ipt_layer7_info * info) +{ + /* If we're in here, throw the app data away */ + WRITE_LOCK(&ct_lock); + if(master_conntrack->layer7.app_data != NULL) { + + #ifdef CONFIG_IP_NF_MATCH_LAYER7_DEBUG + if(!master_conntrack->layer7.app_proto) { + char * f = friendly_print(master_conntrack->layer7.app_data); + char * g = hex_print(master_conntrack->layer7.app_data); + DPRINTK("\nl7-filter gave up after %d bytes (%d packets):\n%s\n", + strlen(f), + TOTAL_PACKETS, f); + kfree(f); + DPRINTK("In hex: %s\n", g); + kfree(g); + } + #endif + + kfree(master_conntrack->layer7.app_data); + master_conntrack->layer7.app_data = NULL; /* don't free again */ + } + WRITE_UNLOCK(&ct_lock); + + if(master_conntrack->layer7.app_proto){ + /* Here child connections set their .app_proto (for /proc/net/ip_conntrack) */ + WRITE_LOCK(&ct_lock); + if(!conntrack->layer7.app_proto) { + conntrack->layer7.app_proto = kmalloc(strlen(master_conntrack->layer7.app_proto)+1, GFP_ATOMIC); + if(!conntrack->layer7.app_proto){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in match_no_append, bailing.\n"); + WRITE_UNLOCK(&ct_lock); + return 1; + } + 
strcpy(conntrack->layer7.app_proto, master_conntrack->layer7.app_proto); + } + WRITE_UNLOCK(&ct_lock); + + return (!strcmp(master_conntrack->layer7.app_proto, info->protocol)); + } + else { + /* If not classified, set to "unknown" to distinguish from + connections that are still being tested. */ + WRITE_LOCK(&ct_lock); + master_conntrack->layer7.app_proto = kmalloc(strlen("unknown")+1, GFP_ATOMIC); + if(!master_conntrack->layer7.app_proto){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in match_no_append, bailing.\n"); + WRITE_UNLOCK(&ct_lock); + return 1; + } + strcpy(master_conntrack->layer7.app_proto, "unknown"); + WRITE_UNLOCK(&ct_lock); + return 0; + } +} + +/* add the new app data to the conntrack. Return number of bytes added. */ +static int add_data(struct ip_conntrack * master_conntrack, + char * app_data, int appdatalen) +{ + int length = 0, i; + int oldlength = master_conntrack->layer7.app_data_len; + + /* Strip nulls. Make everything lower case (our regex lib doesn't + do case insensitivity). Add it to the end of the current data. */ + for(i = 0; i < maxdatalen-oldlength-1 && i < appdatalen; i++) { + if(app_data[i] != '\0') { + master_conntrack->layer7.app_data[length+oldlength] = + /* the kernel version of tolower mungs 'upper ascii' */ + isascii(app_data[i])? tolower(app_data[i]) : app_data[i]; + length++; + } + } + + master_conntrack->layer7.app_data[length+oldlength] = '\0'; + master_conntrack->layer7.app_data_len = length + oldlength; + + return length; +} + +/* Returns true on match and false otherwise. 
*/ +static int match(/* const */struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + struct ipt_layer7_info * info = (struct ipt_layer7_info *)matchinfo; + enum ip_conntrack_info master_ctinfo, ctinfo; + struct ip_conntrack *master_conntrack, *conntrack; + unsigned char * app_data; + unsigned int pattern_result, appdatalen; + regexp * comppattern; + + if(!can_handle(skb)){ + DPRINTK("layer7: This is some protocol I can't handle.\n"); + return info->invert; + } + + /* Treat the parent and all its children together as one connection, + except for the purpose of setting conntrack->layer7.app_proto in the + actual connection. This makes /proc/net/ip_conntrack somewhat more + satisfying. */ + if(!(conntrack = ip_conntrack_get((struct sk_buff *)skb, &ctinfo)) || + !(master_conntrack = ip_conntrack_get((struct sk_buff *)skb, &master_ctinfo))) { + DPRINTK("layer7: packet is not from a known connection, giving up.\n"); + return info->invert; + } + + /* Try to get a master conntrack (and its master etc) for FTP, etc. */ + while (master_ct(master_conntrack) != NULL) + master_conntrack = master_ct(master_conntrack); + + if(!skb->cb[0]){ + WRITE_LOCK(&ct_lock); + master_conntrack->layer7.numpackets++;/*starts at 0 via memset*/ + WRITE_UNLOCK(&ct_lock); + } + + /* if we've classified it or seen too many packets */ + if(TOTAL_PACKETS > num_packets || + master_conntrack->layer7.app_proto) { + + pattern_result = match_no_append(conntrack, master_conntrack, ctinfo, master_ctinfo, info); + + /* skb->cb[0] == seen. Avoid doing things twice if there are two l7 + rules. I'm not sure that using cb for this purpose is correct, although + it says "put your private variables there". But it doesn't look like it + is being used for anything else in the skbs that make it here. How can + I write to cb without making the compiler angry? 
*/ + skb->cb[0] = 1; /* marking it seen here is probably irrelevant, but consistant */ + + return (pattern_result ^ info->invert); + } + + if(skb_is_nonlinear(skb)){ + if(skb_linearize(skb, GFP_ATOMIC) != 0){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: failed to linearize packet, bailing.\n"); + return info->invert; + } + } + + /* now that the skb is linearized, it's safe to set these. */ + app_data = skb->data + app_data_offset(skb); + appdatalen = skb->tail - app_data; + + LOCK_BH(&list_lock); + /* the return value gets checked later, when we're ready to use it */ + comppattern = compile_and_cache(info->pattern, info->protocol); + UNLOCK_BH(&list_lock); + + /* On the first packet of a connection, allocate space for app data */ + WRITE_LOCK(&ct_lock); + if(TOTAL_PACKETS == 1 && !skb->cb[0] && !master_conntrack->layer7.app_data) { + master_conntrack->layer7.app_data = kmalloc(maxdatalen, GFP_ATOMIC); + if(!master_conntrack->layer7.app_data){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in match, bailing.\n"); + WRITE_UNLOCK(&ct_lock); + return info->invert; + } + + master_conntrack->layer7.app_data[0] = '\0'; + } + WRITE_UNLOCK(&ct_lock); + + /* Can be here, but unallocated, if numpackets is increased near + the beginning of a connection */ + if(master_conntrack->layer7.app_data == NULL) + return (info->invert); /* unmatched */ + + if(!skb->cb[0]){ + int newbytes; + WRITE_LOCK(&ct_lock); + newbytes = add_data(master_conntrack, app_data, appdatalen); + WRITE_UNLOCK(&ct_lock); + + if(newbytes == 0) { /* didn't add any data */ + skb->cb[0] = 1; + /* Didn't match before, not going to match now */ + return info->invert; + } + } + + /* If looking for "unknown", then never match. "Unknown" means that + we've given up; we're still trying with these packets. 
*/ + if(!strcmp(info->protocol, "unknown")) { + pattern_result = 0; + /* If the regexp failed to compile, don't bother running it */ + } else if(comppattern && regexec(comppattern, master_conntrack->layer7.app_data)) { + DPRINTK("layer7: regexec positive: %s!\n", info->protocol); + pattern_result = 1; + } else pattern_result = 0; + + if(pattern_result) { + WRITE_LOCK(&ct_lock); + master_conntrack->layer7.app_proto = kmalloc(strlen(info->protocol)+1, GFP_ATOMIC); + if(!master_conntrack->layer7.app_proto){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory in match, bailing.\n"); + WRITE_UNLOCK(&ct_lock); + return (pattern_result ^ info->invert); + } + strcpy(master_conntrack->layer7.app_proto, info->protocol); + WRITE_UNLOCK(&ct_lock); + } + + /* mark the packet seen */ + skb->cb[0] = 1; + + return (pattern_result ^ info->invert); +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_layer7_info))) + return 0; + return 1; +} + +static struct ipt_match layer7_match = { + .name = "layer7", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +/* taken from drivers/video/modedb.c */ +static int my_atoi(const char *s) +{ + int val = 0; + + for (;; s++) { + switch (*s) { + case '0'...'9': + val = 10*val+(*s-'0'); + break; + default: + return val; + } + } +} + +/* write out num_packets to userland. */ +static int layer7_read_proc(char* page, char ** start, off_t off, int count, + int* eof, void * data) +{ + if(num_packets > 99 && net_ratelimit()) + printk(KERN_ERR "layer7: NOT REACHED. 
num_packets too big\n"); + + page[0] = num_packets/10 + '0'; + page[1] = num_packets%10 + '0'; + page[2] = '\n'; + page[3] = '\0'; + + *eof=1; + + return 3; +} + +/* Read in num_packets from userland. Returns count, or -EFAULT if the user buffer cannot be read. */ +static int layer7_write_proc(struct file* file, const char* buffer, + unsigned long count, void *data) +{ + char * foo = kmalloc(count + 1, GFP_ATOMIC); /* one extra byte for the NUL that my_atoi() relies on */ + + if(!foo){ + if (net_ratelimit()) + printk(KERN_ERR "layer7: out of memory, bailing. num_packets unchanged.\n"); + return count; + } + + if(copy_from_user(foo, buffer, count)) { kfree(foo); return -EFAULT; } + foo[count] = '\0'; /* user data is not NUL-terminated; keep my_atoi() inside the buffer */ + num_packets = my_atoi(foo); + kfree (foo); + + /* This has an arbitrary limit to make the math easier. I'm lazy. + But anyway, 99 is a LOT! If you want more, you're doing it wrong! */ + if(num_packets > 99) { + printk(KERN_WARNING "layer7: num_packets can't be > 99.\n"); + num_packets = 99; + } else if(num_packets < 1) { + printk(KERN_WARNING "layer7: num_packets can't be < 1.\n"); + num_packets = 1; + } + + return count; +} + +/* register the proc file; on failure nothing is registered and the caller carries on without it */ +static void layer7_init_proc(void) +{ + struct proc_dir_entry* entry; + entry = create_proc_entry("layer7_numpackets", 0644, proc_net); if(!entry) { printk(KERN_ERR "layer7: could not create /proc/net/layer7_numpackets\n"); return; } + entry->read_proc = layer7_read_proc; + entry->write_proc = layer7_write_proc; +} + +static void layer7_cleanup_proc(void) +{ + remove_proc_entry("layer7_numpackets", proc_net); +} + +static int __init init(void) +{ + layer7_init_proc(); + if(maxdatalen < 1) { + printk(KERN_WARNING "layer7: maxdatalen can't be < 1, using 1\n"); + maxdatalen = 1; + } + /* This is not a hard limit. It's just here to prevent people from + bringing their slow machines to a grinding halt.
*/ + else if(maxdatalen > 65536) { + printk(KERN_WARNING "layer7: maxdatalen can't be > 65536, using 65536\n"); + maxdatalen = 65536; + } + return ipt_register_match(&layer7_match); +} + +static void __exit fini(void) +{ + layer7_cleanup_proc(); + ipt_unregister_match(&layer7_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_mac.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_mac.c index b320e29b..d0475155 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ipt_mac.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_mac.c @@ -19,7 +19,8 @@ match(const struct sk_buff *skb, const struct ipt_mac_info *info = matchinfo; /* Is mac pointer valid? */ - return (skb->mac.raw >= skb->head + return (in != NULL // added for OUTPUT experiment -- zzz + && skb->mac.raw >= skb->head && (skb->mac.raw + ETH_HLEN) <= skb->data /* If so, compare... */ && ((memcmp(skb->mac.ethernet->h_source, info->srcaddr, ETH_ALEN) @@ -33,6 +34,7 @@ ipt_mac_checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { +#if 0 // removed for OUTPUT experiment --jz /* FORWARD isn't always valid, but it's nice to be able to do --RR */ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) @@ -40,7 +42,7 @@ ipt_mac_checkentry(const char *tablename, printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); return 0; } - +#endif if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) return 0; diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_macsave.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_macsave.c new file mode 100644 index 00000000..25fa26a4 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_macsave.c @@ -0,0 +1,62 @@ +/* + + macsave match + Copyright (C) 2006 Jonathan Zarate + + Licensed under GNU GPL v2 or later. 
+ +*/ + +#include +#include +#include + +#include +#include +#include + +#define DEBUG 1 + +#ifdef DEBUG +#define DLOG printk +#else +#define DLOG(...) do { } while (0); +#endif + + +static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const void *matchinfo, int offset, const void *hdr, u_int16_t datalen, int *hotdrop) +{ + const struct ipt_macsave_match_info *info = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); // note about cast: ip_conntrack_get() will not modify skb + if (ct) return (memcmp(ct->macsave, info->mac, sizeof(ct->macsave)) == 0) ^ info->invert; + return info->invert; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_macsave_match_info))); +} + + +static struct ipt_match macsave_match += { { NULL, NULL }, "macsave", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + DLOG(KERN_INFO "macsave match init " __DATE__ " " __TIME__ "\n"); + return ipt_register_match(&macsave_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&macsave_match); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_mport.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_mport.c index ca99b764..836d3f2f 100644 --- a/release/src/linux/linux/net/ipv4/netfilter/ipt_mport.c +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_mport.c @@ -10,7 +10,11 @@ MODULE_LICENSE("GPL"); +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else #define duprintf(format, args...) +#endif /* Returns 1 if the port is matched by the test, 0 otherwise. 
*/ static inline int diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_quota.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_quota.c new file mode 100644 index 00000000..d7ab39cf --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_quota.c @@ -0,0 +1,88 @@ +/* + * netfilter module to enforce network quotas + * + * Sam Johnston + * + * 30/01/05: Fixed on SMP --Pablo Neira + */ +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +static spinlock_t quota_lock = SPIN_LOCK_UNLOCKED; + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, const void *hdr, u_int16_t datalen, int *hotdrop) +{ + struct ipt_quota_info *q = + ((struct ipt_quota_info *) matchinfo)->master; + + spin_lock_bh("a_lock); + + if (q->quota >= datalen) { + /* we can afford this one */ + q->quota -= datalen; + spin_unlock_bh("a_lock); + +#ifdef DEBUG_IPT_QUOTA + printk("IPT Quota OK: %llu datlen %d \n", q->quota, datalen); +#endif + return 1; + } + + /* so we do not allow even small packets from now on */ + q->quota = 0; + +#ifdef DEBUG_IPT_QUOTA + printk("IPT Quota Failed: %llu datlen %d \n", q->quota, datalen); +#endif + + spin_unlock_bh("a_lock); + return 0; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, unsigned int hook_mask) +{ + /* TODO: spinlocks? sanity checks? */ + struct ipt_quota_info *q = (struct ipt_quota_info *) matchinfo; + + if (matchsize != IPT_ALIGN(sizeof (struct ipt_quota_info))) + return 0; + + /* For SMP, we only want to use one set of counters. 
*/ + q->master = q; + + return 1; +} + +static struct ipt_match quota_match + = { {NULL, NULL}, "quota", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init +init(void) +{ + return ipt_register_match("a_match); +} + +static void __exit +fini(void) +{ + ipt_unregister_match("a_match); +} + +module_init(init); +module_exit(fini); + diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_recent.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_recent.c new file mode 100644 index 00000000..808ae78f --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_recent.c @@ -0,0 +1,998 @@ +/* Kernel module to check if the source address has been seen recently. */ +/* Copyright 2002-2003, Stephen Frost */ +/* Author: Stephen Frost */ +/* Project Page: http://snowman.net/projects/ipt_recent/ */ +/* This software is distributed under the terms of the GPL, Version 2 */ +/* This copyright does not cover user programs that use kernel services + * by normal system calls. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#undef DEBUG +#define HASH_LOG 9 + +/* Defaults, these can be overridden on the module command-line. */ +static int ip_list_tot = 100; +static int ip_pkt_list_tot = 20; +static int ip_list_hash_size = 0; +static int ip_list_perms = 0644; +#ifdef DEBUG +static int debug = 1; +#endif + +static char version[] = +KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost . 
http://snowman.net/projects/ipt_recent/\n"; + +MODULE_AUTHOR("Stephen Frost "); +MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); +MODULE_LICENSE("GPL"); +MODULE_PARM(ip_list_tot,"i"); +MODULE_PARM(ip_pkt_list_tot,"i"); +MODULE_PARM(ip_list_hash_size,"i"); +MODULE_PARM(ip_list_perms,"i"); +#ifdef DEBUG +MODULE_PARM(debug,"i"); +MODULE_PARM_DESC(debug,"debugging level, defaults to 1"); +#endif +MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); +MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); + +/* Structure of our list of recently seen addresses. Each instance is one slot of a table and describes one remembered address. */ +struct recent_ip_list { + u_int32_t addr; /* remembered address; 0 marks an unused slot */ + u_int8_t ttl; /* TTL recorded with the address; 0 when it is unknown (entry added through /proc) */ + u_int32_t last_seen; /* jiffies value stored by the latest SET/UPDATE */ + u_int32_t *last_pkts; /* ip_pkt_list_tot jiffies stamps, used as a ring; points into one shared allocation */ + u_int32_t oldest_pkt; /* index in last_pkts that the next stamp overwrites */ + u_int32_t hash_entry; /* index of the hash_table slot that refers to this entry */ + u_int32_t time_pos; /* index of this entry's record in time_info */ +}; + +/* One record of the per-table time ordering: which table slot it stands for and when that slot was last stamped. */ +struct time_info_list { + u_int32_t position; /* index into the table array */ + u_int32_t time; /* jiffies stamp of that slot; 0 after removal or clear */ +}; + +/* Structure of our linked list of tables of recent lists. One node exists per table name used by the rules. */ +struct recent_ip_tables { + char name[IPT_RECENT_NAME_LEN]; /* table name taken from the rule */ + int count; /* number of rules referring to this table */ + int time_pos; /* time_info index whose slot is reused for the next new address */ + struct recent_ip_list *table; /* ip_list_tot address slots */ + struct recent_ip_tables *next; /* next table in the r_tables list */ + spinlock_t list_lock; /* held while table, hash_table and time_info are read or changed */ + int *hash_table; /* ip_list_hash_size entries: -1 when empty, otherwise an index into table */ + struct time_info_list *time_info; /* ip_list_tot records, see struct time_info_list */ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *status_proc; /* this table's file under /proc/net/ipt_recent */ +#endif /* CONFIG_PROC_FS */ +}; + +/* Our current list of addresses we have recently seen. + * Only added to on a --set, and only updated on --set || --update + */ +static struct recent_ip_tables *r_tables = NULL; + +/* We protect the r_tables linked list with this spinlock so two processors + * are not modifying the list at the same time. + */ +static spinlock_t recent_lock = SPIN_LOCK_UNLOCKED; + +/* Our /proc/net/ipt_recent entry */ +static struct proc_dir_entry *proc_net_ipt_recent = NULL; + +/* Function declaration for later. 
*/ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop); + +/* Function to hash a given address into the hash table of table_size size */ +int hash_func(unsigned int addr, int table_size) +{ + int result = 0; + unsigned int value = addr; + do { result ^= value; } while((value >>= HASH_LOG)); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", + result & (table_size - 1), + addr, + table_size); +#endif + + return(result & (table_size - 1)); +} + +#ifdef CONFIG_PROC_FS +/* This is the function which produces the output for our /proc output + * interface which lists each IP address, the last seen time and the + * other recent times the address was seen. + */ + +static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +{ + int len = 0, count, last_len = 0, pkt_count; + off_t pos = 0; + off_t begin = 0; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + spin_lock_bh(&curr_table->list_lock); + for(count = 0; count < ip_list_tot; count++) { + if(!curr_table->table[count].addr) continue; + last_len = len; + len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); + len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); + len += sprintf(buffer+len,"last_seen: %u ",curr_table->table[count].last_seen); + len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); + len += sprintf(buffer+len,"last_pkts: %u",curr_table->table[count].last_pkts[0]); + for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(!curr_table->table[count].last_pkts[pkt_count]) break; + len += sprintf(buffer+len,", %u",curr_table->table[count].last_pkts[pkt_count]); + } + len += sprintf(buffer+len,"\n"); + pos = begin + len; + if(pos < offset) { len = 0; 
begin = pos; } + if(pos > offset + length) { len = last_len; break; } + } + + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) len = length; + + spin_unlock_bh(&curr_table->list_lock); + return len; +} + +/* ip_recent_ctrl provides an interface for users to modify the table + * directly. This allows adding entries, removing entries, and + * flushing the entire table. + * This is done by opening up the appropriate table for writing and + * sending one of: + * xx.xx.xx.xx -- Add entry to table with current time + * +xx.xx.xx.xx -- Add entry to table with current time + * -xx.xx.xx.xx -- Remove entry from table + * clear -- Flush table, remove all entries + */ + +static int ip_recent_ctrl(struct file *file, const char *input, unsigned long size, void *data) +{ + static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; + u_int32_t val; + int base, used = 0; + char c, *cp; + union iaddr { + uint8_t bytes[4]; + uint32_t word; + } res; + uint8_t *pp = res.bytes; + int digit; + + char buffer[20]; + int len, check_set = 0, count; + u_int32_t addr = 0; + struct sk_buff *skb; + struct ipt_recent_info *info; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + if(size > 20) len = 20; else len = size; + + if(copy_from_user(buffer,input,len)) return -EFAULT; + + if(len < 20) buffer[len] = '\0'; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); +#endif + + cp = buffer; + while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } + + /* Check if we are asked to flush the entire table */ + if(!memcmp(cp,"clear",5)) { + used += 5; + spin_lock_bh(&curr_table->list_lock); + curr_table->time_pos = 0; + for(count = 0; count < ip_list_hash_size; count++) { + curr_table->hash_table[count] = -1; + } + for(count = 0; count < ip_list_tot; count++) { + curr_table->table[count].last_seen = 0; + curr_table->table[count].addr = 0; + 
curr_table->table[count].ttl = 0; + memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + curr_table->table[count].oldest_pkt = 0; + curr_table->table[count].time_pos = 0; + curr_table->time_info[count].position = count; + curr_table->time_info[count].time = 0; + } + spin_unlock_bh(&curr_table->list_lock); + return used; + } + + check_set = IPT_RECENT_SET; + switch(*cp) { + case '+': check_set = IPT_RECENT_SET; cp++; used++; break; + case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; + default: if(!isdigit(*cp)) return (used+1); break; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); +#endif + /* Get addr (effectively inet_aton()) */ + /* Shamelessly stolen from libc, a function in the kernel for doing + * this would, of course, be greatly preferred, but our options appear + * to be rather limited, so we will just do it ourselves here. + */ + res.word = 0; + + c = *cp; + for(;;) { + if(!isdigit(c)) return used; + val = 0; base = 10; digit = 0; + if(c == '0') { + c = *++cp; + if(c == 'x' || c == 'X') base = 16, c = *++cp; + else { base = 8; digit = 1; } + } + for(;;) { + if(isascii(c) && isdigit(c)) { + if(base == 8 && (c == '8' || c == '0')) return used; + val = (val * base) + (c - '0'); + c = *++cp; + digit = 1; + } else if(base == 16 && isascii(c) && isxdigit(c)) { + val = (val << 4) | (c + 10 - (islower(c) ? 
'a' : 'A')); + c = *++cp; + digit = 1; + } else break; + } + if(c == '.') { + if(pp > res.bytes + 2 || val > 0xff) return used; + *pp++ = val; + c = *++cp; + } else break; + } + used = cp - buffer; + if(c != '\0' && (!isascii(c) || !isspace(c))) return used; + if(c == '\n') used++; + if(!digit) return used; + + if(val > max[pp - res.bytes]) return used; + addr = res.word | htonl(val); + + if(!addr && check_set == IPT_RECENT_SET) return used; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); +#endif + + /* Set up and just call match */ + info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); + if(!info) { return -ENOMEM; } + info->seconds = 0; + info->hit_count = 0; + info->check_set = check_set; + info->invert = 0; + info->side = IPT_RECENT_SOURCE; + strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); + info->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); + if (!skb) { + used = -ENOMEM; + goto out_free_info; + } + skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); + if (!skb->nh.iph) { + used = -ENOMEM; + goto out_free_skb; + } + + skb->nh.iph->saddr = addr; + skb->nh.iph->daddr = 0; + /* Clear ttl since we have no way of knowing it */ + skb->nh.iph->ttl = 0; + match(skb,NULL,NULL,info,0,NULL,sizeof(struct ipt_recent_info),NULL); + + kfree(skb->nh.iph); +out_free_skb: + kfree(skb); +out_free_info: + kfree(info); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); +#endif + return used; +} + +#endif /* CONFIG_PROC_FS */ + +/* 'match' is our primary function, called by the kernel whenever a rule is + * hit with our module as an option to it. 
+ * What this function does depends on what was specifically asked of it by + * the user: + * --set -- Add or update last seen time of the source address of the packet + * -- matchinfo->check_set == IPT_RECENT_SET + * --rcheck -- Just check if the source address is in the list + * -- matchinfo->check_set == IPT_RECENT_CHECK + * --update -- If the source address is in the list, update last_seen + * -- matchinfo->check_set == IPT_RECENT_UPDATE + * --remove -- If the source address is in the list, remove it + * -- matchinfo->check_set == IPT_RECENT_REMOVE + * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds + * -- matchinfo->seconds + * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times + * -- matchinfo->hit_count + * --seconds and --hitcount can be combined + */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + int pkt_count, hits_found, ans; + unsigned long now; + const struct ipt_recent_info *info = matchinfo; + u_int32_t addr = 0, time_temp; + u_int8_t ttl = skb->nh.iph->ttl; + int *hash_table; + int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; + struct time_info_list *time_info; + struct recent_ip_tables *curr_table; + struct recent_ip_tables *last_table; + struct recent_ip_list *r_list; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); +#endif + + /* Default is false ^ info->invert */ + ans = info->invert; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); +#endif + + /* if out != NULL then routing has been done and TTL changed. + * We change it back here internally for match what came in before routing. 
*/ + if(out) ttl++; + + /* Find the right table */ + spin_lock_bh(&recent_lock); + curr_table = r_tables; + while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); +#endif + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found, match impossible */ + if(!curr_table) { return ans; } + + /* Make sure no one is changing the list while we work with it */ + spin_lock_bh(&curr_table->list_lock); + + r_list = curr_table->table; + if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; + + if(!addr) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); +#endif + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); +#endif + + /* Get jiffies now in case they changed while we were waiting for a lock */ + now = jiffies; + hash_table = curr_table->hash_table; + time_info = curr_table->time_info; + + orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); + /* Hash entry at this result used */ + /* Check for TTL match if requested. If TTL is zero then a match would never + * happen, so match regardless of existing TTL in that case. Zero means the + * entry was added via the /proc interface anyway, so we will just use the + * first TTL we get for that IP address. 
*/ + if(info->check_set & IPT_RECENT_TTL) { + while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && + (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } else { + while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } + + if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { + /* IP not in list and not asked to SET */ + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + /* Check if we need to handle the collision, do not need to on REMOVE */ + if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", + orig_hash_result, + hash_result, + r_list[hash_table[orig_hash_result]].addr, + addr); +#endif + + /* We had a collision. + * orig_hash_result is where we started, hash_result is where we ended up. 
+ * So, swap them because we are likely to see the same guy again sooner */ +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); + printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", + r_list[hash_table[orig_hash_result]].hash_entry); + } +#endif + + r_list[hash_table[orig_hash_result]].hash_entry = hash_result; + + + temp = hash_table[orig_hash_result]; +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); +#endif + hash_table[orig_hash_result] = hash_table[hash_result]; + hash_table[hash_result] = temp; + temp = hash_result; + hash_result = orig_hash_result; + orig_hash_result = temp; + time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; + if(hash_table[hash_result] != -1) { + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); +#endif + } + + if(hash_table[hash_result] == -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. 
(hr: %d,ha: %u)\n", + hash_result, addr); +#endif + + /* New item found and IPT_RECENT_SET, so we need to add it */ + location = time_info[curr_table->time_pos].position; + hash_table[r_list[location].hash_entry] = -1; + hash_table[hash_result] = location; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + r_list[location].time_pos = curr_table->time_pos; + r_list[location].addr = addr; + r_list[location].ttl = ttl; + r_list[location].last_seen = now; + r_list[location].oldest_pkt = 1; + r_list[location].last_pkts[0] = now; + r_list[location].hash_entry = hash_result; + time_info[curr_table->time_pos].time = r_list[location].last_seen; + curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; + + ans = !info->invert; + } else { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", + hash_result, + addr); +#endif + + /* Existing item found */ + location = hash_table[hash_result]; + /* We have a match on address, now to make sure it meets all requirements for a + * full match. 
*/ + if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { + if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; + if(info->seconds && !info->hit_count) { + if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; + } + if(info->seconds && info->hit_count) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + if(info->hit_count && !info->seconds) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; + hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + } +#ifdef DEBUG + if(debug) { + if(ans) + printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); + else + printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); + } +#endif + + /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the + * current timestamp to the last_seen. 
*/ + if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); +#endif + /* Have to update our time info */ + time_loc = r_list[location].time_pos; + time_info[time_loc].time = now; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].ttl = ttl; + r_list[location].last_pkts[r_list[location].oldest_pkt] = now; + r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; + r_list[location].last_seen = now; + } + /* If we have been asked to remove the entry from the list, just set it to 0 */ + if(info->check_set & IPT_RECENT_REMOVE) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); +#endif + /* Check if this is part of a collision chain */ + while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { + orig_hash_result++; + if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { + /* Found collision chain, how deep does this rabbit hole go? 
*/ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); +#endif + end_collision_chain = orig_hash_result; + } + } + if(end_collision_chain != -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); +#endif + /* Part of a collision chain, swap it with the end of the chain + * before removing. */ + r_list[hash_table[end_collision_chain]].hash_entry = hash_result; + temp = hash_table[end_collision_chain]; + hash_table[end_collision_chain] = hash_table[hash_result]; + hash_table[hash_result] = temp; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + hash_result = end_collision_chain; + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + location = hash_table[hash_result]; + hash_table[r_list[location].hash_entry] = -1; + time_loc = r_list[location].time_pos; + time_info[time_loc].time = 0; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].last_seen = 0; + r_list[location].addr = 0; + r_list[location].ttl = 0; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + 
r_list[location].oldest_pkt = 0; + ans = !info->invert; + } + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + spin_unlock_bh(&curr_table->list_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); +#endif + return ans; +} + +/* This function is to verify that the rule given during the userspace iptables + * command is correct. + * If the command is valid then we check if the table name referred to by the + * rule exists, if not it is created. + */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + int flag = 0, c; + u_int32_t *hold; + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *find_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); +#endif + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0; + + /* seconds and hit_count only valid for CHECK/UPDATE */ + if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_CHECK) flag++; + if(info->check_set & IPT_RECENT_UPDATE) flag++; + + /* One and only one of these should ever be set */ + if(flag != 1) return 0; + + /* Name must be set to something */ + if(!info->name || !info->name[0]) return 0; + + /* Things look good, create a list for this if it does not exist */ + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just 
increment the count on that table and return */ + if(find_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); +#endif + find_table->count++; + spin_unlock_bh(&recent_lock); + return 1; + } + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found */ + /* Allocate memory for new linked list item */ + +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); + printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + } +#endif + + curr_table = vmalloc(sizeof(struct recent_ip_tables)); + if(curr_table == NULL) return -ENOMEM; + + curr_table->list_lock = SPIN_LOCK_UNLOCKED; + curr_table->next = NULL; + curr_table->count = 1; + curr_table->time_pos = 0; + strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); + curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + /* Allocate memory for this table and the list of packets in each entry. 
*/ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", + sizeof(struct recent_ip_list)*ip_list_tot, + info->name); +#endif + + curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); + if(curr_table->table == NULL) { vfree(curr_table); return -ENOMEM; } + memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", + sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#endif + + hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); +#endif + if(hold == NULL) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); + vfree(curr_table->table); + vfree(curr_table); + return -ENOMEM; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; + } + + /* Allocate memory for the hash table */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", + sizeof(int)*ip_list_hash_size); +#endif + + curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); + if(!curr_table->hash_table) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return -ENOMEM; + } + + for(c = 0; c < ip_list_hash_size; c++) { + curr_table->hash_table[c] = -1; + } + + /* Allocate memory for the time info */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", + sizeof(struct time_info_list)*ip_list_tot); +#endif + + curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); + if(!curr_table->time_info) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); + vfree(curr_table->hash_table); + 
vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return -ENOMEM; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->time_info[c].position = c; + curr_table->time_info[c].time = 0; + } + + /* Put the new table in place */ + spin_lock_bh(&recent_lock); + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just increment the count on that table and return */ + if(find_table) { + find_table->count++; + spin_unlock_bh(&recent_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); +#endif + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 1; + } + if(!last_table) r_tables = curr_table; else last_table->next = curr_table; + + spin_unlock_bh(&recent_lock); + +#ifdef CONFIG_PROC_FS + /* Create our proc 'status' entry. 
*/ + curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); + if (!curr_table->status_proc) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); + /* Destroy the created table */ + spin_lock_bh(&recent_lock); + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); +#endif + spin_unlock_bh(&recent_lock); + return -ENOMEM; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); +#endif + spin_unlock_bh(&recent_lock); + return -ENOMEM; + } + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + spin_unlock_bh(&recent_lock); + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return -ENOMEM; + } + + curr_table->status_proc->owner = THIS_MODULE; + curr_table->status_proc->data = curr_table; + wmb(); + curr_table->status_proc->read_proc = ip_recent_get_info; + curr_table->status_proc->write_proc = ip_recent_ctrl; +#endif /* CONFIG_PROC_FS */ + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); +#endif + + return 1; +} + +/* This function is called in the event that a rule matching this module is + * removed. + * When this happens we need to check if there are no other rules matching + * the table given. If that is the case then we remove the table and clean + * up its memory. 
+ */ +static void +destroy(void *matchinfo, unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); +#endif + + if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; + + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + + /* If a table does not exist then do nothing and return */ + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + + curr_table->count--; + + /* If count is still non-zero then there are still rules referenceing it so we do nothing */ + if(curr_table->count) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); +#endif + + /* Count must be zero so we remove this table from the list */ + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + + spin_unlock_bh(&recent_lock); + + /* lock to make sure any late-runners still using this after we removed it from + * the list finish up then remove everything */ + spin_lock_bh(&curr_table->list_lock); + spin_unlock_bh(&curr_table->list_lock); + 
+#ifdef CONFIG_PROC_FS + if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +#endif /* CONFIG_PROC_FS */ + vfree(curr_table->table[0].last_pkts); + vfree(curr_table->table); + vfree(curr_table->hash_table); + vfree(curr_table->time_info); + vfree(curr_table); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); +#endif + + return; +} + +/* This is the structure we pass to ipt_register to register our + * module with iptables. + */ +static struct ipt_match recent_match = { + .name = "recent", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + +/* Kernel module initialization. */ +static int __init init(void) +{ + int count; + + printk(version); + proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); + if(!proc_net_ipt_recent) return -ENOMEM; + + if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { + printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); + ip_list_hash_size = 0; + } + + if(!ip_list_hash_size) { + ip_list_hash_size = ip_list_tot*3; + count = 2*2; + while(ip_list_hash_size > count) count = count*2; + ip_list_hash_size = count; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); +#endif + + return ipt_register_match(&recent_match); +} + +/* Kernel module destruction. */ +static void __exit fini(void) +{ + ipt_unregister_match(&recent_match); + + remove_proc_entry("ipt_recent",proc_net); +} + +/* Register our module with the kernel. */ +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_string.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 00000000..a18b89d0 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_string.c @@ -0,0 +1,218 @@ +/* Kernel module to match a string into a packet. 
+ * + * Copyright (C) 2000 Emmanuel Roger + * + * ChangeLog + * 19.02.2002: Gianni Tedesco + * Fixed SMP re-entrancy problem using per-cpu data areas + * for the skip/shift tables. + * 02.05.2001: Gianni Tedesco + * Fixed kernel panic, due to overrunning boyer moore string + * tables. Also slightly tweaked heuristic for deciding what + * search algo to use. + * 27.01.2001: Gianni Tedesco + * Implemented Boyer Moore Sublinear search algorithm + * alongside the existing linear search based on memcmp(). + * Also a quick check to decide which method to use on a per + * packet basis. + */ + +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +struct string_per_cpu { + int *skip; + int *shift; + int *len; +}; + +struct string_per_cpu *bm_string_data=NULL; + +/* Boyer Moore Sublinear string search - VERY FAST */ +char *search_sublinear (char *needle, char *haystack, int needle_len, int haystack_len) +{ + int M1, right_end, sk, sh; + int ended, j, i; + + int *skip, *shift, *len; + + /* use data suitable for this CPU */ + shift=bm_string_data[smp_processor_id()].shift; + skip=bm_string_data[smp_processor_id()].skip; + len=bm_string_data[smp_processor_id()].len; + + /* Setup skip/shift tables */ + M1 = right_end = needle_len-1; + for (i = 0; i < BM_MAX_HLEN; i++) skip[i] = needle_len; + for (i = 0; needle[i]; i++) skip[needle[i]] = M1 - i; + + for (i = 1; i < needle_len; i++) { + for (j = 0; j < needle_len && needle[M1 - j] == needle[M1 - i - j]; j++); + len[i] = j; + } + + shift[0] = 1; + for (i = 1; i < needle_len; i++) shift[i] = needle_len; + for (i = M1; i > 0; i--) shift[len[i]] = i; + ended = 0; + + for (i = 0; i < needle_len; i++) { + if (len[i] == M1 - i) ended = i; + if (ended) shift[i] = ended; + } + + /* Do the search*/ + while (right_end < haystack_len) + { + for (i = 0; i < needle_len && haystack[right_end - i] == needle[M1 - i]; i++); + if (i == needle_len) { + return haystack+(right_end - M1); + } + + sk = 
skip[haystack[right_end - i]]; + sh = shift[i]; + right_end = max(right_end - i + sk, right_end + sh); + } + + return NULL; +} + +/* Linear string search based on memcmp() */ +char *search_linear (char *needle, char *haystack, int needle_len, int haystack_len) +{ + char *k = haystack + (haystack_len-needle_len); + char *t = haystack; + + while ( t <= k ) { + if (memcmp(t, needle, needle_len) == 0) + return t; + t++; + } + + return NULL; +} + + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_string_info *info = matchinfo; + struct iphdr *ip = skb->nh.iph; + int hlen, nlen; + char *needle, *haystack; + proc_ipt_search search=search_linear; + + if ( !ip ) return 0; + + /* get lenghts, and validate them */ + nlen=info->len; + hlen=ntohs(ip->tot_len)-(ip->ihl*4); + if ( nlen > hlen ) return 0; + + needle=(char *)&info->string; + haystack=(char *)ip+(ip->ihl*4); + + /* The sublinear search comes in to its own + * on the larger packets */ + if ( (hlen>IPT_STRING_HAYSTACK_THRESH) && + (nlen>IPT_STRING_NEEDLE_THRESH) ) { + if ( hlen < BM_MAX_HLEN ) { + search=search_sublinear; + }else{ + if (net_ratelimit()) + printk(KERN_INFO "ipt_string: Packet too big " + "to attempt sublinear string search " + "(%d bytes)\n", hlen ); + } + } + + return ((search(needle, haystack, nlen, hlen)!=NULL) ^ info->invert); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) + return 0; + + return 1; +} + +void string_freeup_data(void) +{ + int c; + + if ( bm_string_data ) { + for(c=0; c : force the match to be in LOCAL_IN or PRE_ROUTING only. 
2001-30-11 Fabrice : added the possibility to use the match in FORWARD/OUTPUT with a little hack, added Nguyen Dang Phuoc Dong patch to support timezones. + 2004-05-02 Fabrice : added support for date matching, from an idea of Fabien COELHO. */ #include @@ -19,7 +20,7 @@ #include MODULE_AUTHOR("Fabrice MARIE "); -MODULE_DESCRIPTION("Match arrival timestamp"); +MODULE_DESCRIPTION("Match arrival timestamp/date"); MODULE_LICENSE("GPL"); struct tm @@ -53,7 +54,8 @@ match(const struct sk_buff *skb, { const struct ipt_time_info *info = matchinfo; /* match info for rule */ struct tm currenttime; /* time human readable */ - unsigned int packet_time; + u_int8_t days_of_week[7] = {64, 32, 16, 8, 4, 2, 1}; + u_int16_t packet_time; struct timeval kerneltimeval; time_t packet_local_time; @@ -66,22 +68,21 @@ match(const struct sk_buff *skb, else packet_local_time = skb->stamp.tv_sec; + /* First we make sure we are in the date start-stop boundaries */ + if ((packet_local_time < info->date_start) || (packet_local_time > info->date_stop)) + return 0; /* We are outside the date boundaries */ + /* Transform the timestamp of the packet, in a human readable form */ localtime(&packet_local_time, ¤ttime); /* check if we match this timestamp, we start by the days... */ - if (!((1 << currenttime.tm_wday) & info->days_match)) + if ((days_of_week[currenttime.tm_wday] & info->days_match) != days_of_week[currenttime.tm_wday]) return 0; /* the day doesn't match */ /* ... check the time now */ - packet_time = (currenttime.tm_hour * 60 * 60) + (currenttime.tm_min * 60) + currenttime.tm_sec; - if (info->time_start < info->time_stop) { - if ((packet_time < info->time_start) || (packet_time > info->time_stop)) - return 0; - } else { - if ((packet_time < info->time_start) && (packet_time > info->time_stop)) - return 0; - } + packet_time = (currenttime.tm_hour * 60) + currenttime.tm_min; + if ((packet_time < info->time_start) || (packet_time > info->time_stop)) + return 0; /* here we match ! 
*/ return 1; @@ -96,24 +97,25 @@ checkentry(const char *tablename, { struct ipt_time_info *info = matchinfo; /* match info for rule */ - /* First, check that we are in the correct hook */ - /* PRE_ROUTING, LOCAL_IN or FROWARD */ + /* First, check that we are in the correct hooks */ if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))) { printk("ipt_time: error, only valid for PRE_ROUTING, LOCAL_IN, FORWARD and OUTPUT)\n"); return 0; } - - /* always use kerneltime */ + /* we use the kerneltime if we are in forward or output */ info->kerneltime = 1; + if (hook_mask & ~((1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))) + /* we use the skb time */ + info->kerneltime = 0; /* Check the size */ - if (matchsize < IPT_ALIGN(sizeof(struct ipt_time_info))) + if (matchsize != IPT_ALIGN(sizeof(struct ipt_time_info))) return 0; /* Now check the coherence of the data ... */ - if ((info->time_start > 86399) || /* 24*60*60-1 = 86399*/ - (info->time_stop > 86399)) + if ((info->time_start > 1439) || /* 23*60+59 = 1439*/ + (info->time_stop > 1439)) { printk(KERN_WARNING "ipt_time: invalid argument\n"); return 0; @@ -122,8 +124,12 @@ checkentry(const char *tablename, return 1; } -static struct ipt_match time_match -= { { NULL, NULL }, "time", &match, &checkentry, NULL, THIS_MODULE }; +static struct ipt_match time_match = { + .name = "time", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE, +}; static int __init init(void) { diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_u32.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_u32.c new file mode 100644 index 00000000..0c749563 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_u32.c @@ -0,0 +1,211 @@ +/* Kernel module to match u32 packet content. */ + +/* +U32 tests whether quantities of up to 4 bytes extracted from a packet +have specified values. 
The specification of what to extract is general +enough to find data at given offsets from tcp headers or payloads. + + --u32 tests + The argument amounts to a program in a small language described below. + tests := location = value | tests && location = value + value := range | value , range + range := number | number : number + a single number, n, is interpreted the same as n:n + n:m is interpreted as the range of numbers >=n and <=m + location := number | location operator number + operator := & | << | >> | @ + + The operators &, <<, >>, && mean the same as in c. The = is really a set + membership operator and the value syntax describes a set. The @ operator + is what allows moving to the next header and is described further below. + + *** Until I can find out how to avoid it, there are some artificial limits + on the size of the tests: + - no more than 10 ='s (and 9 &&'s) in the u32 argument + - no more than 10 ranges (and 9 commas) per value + - no more than 10 numbers (and 9 operators) per location + + To describe the meaning of location, imagine the following machine that + interprets it. There are three registers: + A is of type char*, initially the address of the IP header + B and C are unsigned 32 bit integers, initially zero + + The instructions are: + number B = number; + C = (*(A+B)<<24)+(*(A+B+1)<<16)+(*(A+B+2)<<8)+*(A+B+3) + &number C = C&number + <<number C = C<<number + >>number C = C>>number + @number A = A+C; then do the instruction number + Any access of memory outside [skb->head,skb->end] causes the match to fail. + Otherwise the result of the computation is the final value of C. + + Whitespace is allowed but not required in the tests. + However the characters that do occur there are likely to require + shell quoting, so it's a good idea to enclose the arguments in quotes. + +Example: + match IP packets with total length >= 256 + The IP header contains a total length field in bytes 2-3.
+ --u32 "0&0xFFFF=0x100:0xFFFF" + read bytes 0-3 + AND that with FFFF (giving bytes 2-3), + and test whether that's in the range [0x100:0xFFFF] + +Example: (more realistic, hence more complicated) + match icmp packets with icmp type 0 + First test that it's an icmp packet, true iff byte 9 (protocol) = 1 + --u32 "6&0xFF=1 && ... + read bytes 6-9, use & to throw away bytes 6-8 and compare the result to 1 + Next test that it's not a fragment. + (If so it might be part of such a packet but we can't always tell.) + n.b. This test is generally needed if you want to match anything + beyond the IP header. + The last 6 bits of byte 6 and all of byte 7 are 0 iff this is a complete + packet (not a fragment). Alternatively, you can allow first fragments + by only testing the last 5 bits of byte 6. + ... 4&0x3FFF=0 && ... + Last test: the first byte past the IP header (the type) is 0 + This is where we have to use the @syntax. The length of the IP header + (IHL) in 32 bit words is stored in the right half of byte 0 of the + IP header itself. + ... 0>>22&0x3C@0>>24=0" + The first 0 means read bytes 0-3, + >>22 means shift that 22 bits to the right. Shifting 24 bits would give + the first byte, so only 22 bits is four times that plus a few more bits. + &3C then eliminates the two extra bits on the right and the first four + bits of the first byte. + For instance, if IHL=5 then the IP header is 20 (4 x 5) bytes long. + In this case bytes 0-1 are (in binary) xxxx0101 yyzzzzzz, + >>22 gives the 10 bit value xxxx0101yy and &3C gives 010100. + @ means to use this number as a new offset into the packet, and read + four bytes starting from there. This is the first 4 bytes of the icmp + payload, of which byte 0 is the icmp type. Therefore we simply shift + the value 24 to the right to throw out all but the first byte and compare + the result with 0. + +Example: + tcp payload bytes 8-12 is any of 1, 2, 5 or 8 + First we test that the packet is a tcp packet (similar to icmp). 
+ --u32 "6&0xFF=6 && ... + Next, test that it's not a fragment (same as above). + ... 0>>22&0x3C@12>>26&0x3C@8=1,2,5,8" + 0>>22&3C as above computes the number of bytes in the IP header. + @ makes this the new offset into the packet, which is the start of the + tcp header. The length of the tcp header (again in 32 bit words) is + the left half of byte 12 of the tcp header. The 12>>26&3C + computes this length in bytes (similar to the IP header before). + @ makes this the new offset, which is the start of the tcp payload. + Finally 8 reads bytes 8-12 of the payload and = checks whether the + result is any of 1, 2, 5 or 8 +*/ + +#include +#include + +#include +#include + +/* #include for timing */ + +MODULE_AUTHOR("Don Cohen "); +MODULE_DESCRIPTION("IP tables u32 matching module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + const void *hdr, + u_int16_t datalen, + int *hotdrop) +{ + const struct ipt_u32 *data = matchinfo; + int testind, i; + unsigned char* origbase = (char*)skb->nh.iph; + unsigned char* base = origbase; + unsigned char* head = skb->head; + unsigned char* end = skb->end; + int nnums, nvals; + u_int32_t pos, val; + /* unsigned long long cycles1, cycles2, cycles3, cycles4; + cycles1 = get_cycles(); */ + + for (testind=0; testind < data->ntests; testind++) { + base = origbase; /* reset for each test */ + pos = data->tests[testind].location[0].number; + if (base+pos+3 > end || base+pos < head) + return 0; + val = (base[pos]<<24) + (base[pos+1]<<16) + + (base[pos+2]<<8) + base[pos+3]; + nnums = data->tests[testind].nnums; + for (i=1; i < nnums; i++) { + u_int32_t number = data->tests[testind].location[i].number; + switch (data->tests[testind].location[i].nextop) { + case IPT_U32_AND: + val = val & number; + break; + case IPT_U32_LEFTSH: + val = val << number; + break; + case IPT_U32_RIGHTSH: + val = val >> number; + break; + 
case IPT_U32_AT: + base = base + val; + pos = number; + if (base+pos+3 > end || base+pos < head) + return 0; + val = (base[pos]<<24) + (base[pos+1]<<16) + + (base[pos+2]<<8) + base[pos+3]; + break; + } + } + nvals = data->tests[testind].nvalues; + for (i=0; i < nvals; i++) { + if ((data->tests[testind].value[i].min <= val) && + (val <= data->tests[testind].value[i].max)) { + break; + } + } + if (i >= data->tests[testind].nvalues) { + /* cycles2 = get_cycles(); + printk("failed %d in %d cycles\n", testind, + cycles2-cycles1); */ + return 0; + } + } + /* cycles2 = get_cycles(); + printk("succeeded in %d cycles\n", cycles2-cycles1); */ + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_u32))) + return 0; + return 1; +} + +static struct ipt_match u32_match += { { NULL, NULL }, "u32", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + return ipt_register_match(&u32_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&u32_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/ipt_web.c b/release/src/linux/linux/net/ipv4/netfilter/ipt_web.c new file mode 100644 index 00000000..c32a860a --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/ipt_web.c @@ -0,0 +1,246 @@ +/* + + web (experimental) + HTTP client request match + Copyright (C) 2006 Jonathan Zarate + + Licensed under GNU GPL v2 or later. + +*/ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Jonathan Zarate"); +MODULE_DESCRIPTION("HTTP client request match (experimental)"); +MODULE_LICENSE("GPL"); + + +// #define LOG printk +#define LOG(...) 
do { } while (0); + + +static int find(const char *data, const char *tail, const char *text) +{ + int n, o; + int dlen; + const char *p, *e; + + while ((data < tail) && (*data == ' ')) ++data; + while ((tail > data) && (*(tail - 1) == ' ')) --tail; + + dlen = tail - data; + +#if 0 + { + char tmp[128]; + int z; + z = sizeof(tmp) - 1; + if (z > dlen) z = dlen; + memcpy(tmp, data, z); + tmp[z] = 0; + LOG(KERN_INFO "find in '%s'\n", tmp); + } +#endif + + // 012345 + // text + // ^text + // text$ + // ^text$ + // 012345 + + while (*text) { + n = o = strlen(text); + if (*text == '^') { + --n; + if (*(text + n) == '$') { + // exact + --n; + if ((dlen == n) && (memcmp(data, text + 1, n) == 0)) { + LOG(KERN_INFO "matched %s\n", text); + return 1; + } + } + else { + // begins with + if ((dlen >= n) && (memcmp(data, text + 1, n) == 0)) { + LOG(KERN_INFO "matched %s\n", text); + return 1; + } + } + } + else if (*(text + n - 1) == '$') { + // ends with + --n; + if (memcmp(tail - n, text, n) == 0) { + LOG(KERN_INFO "matched %s\n", text); + return 1; + } + } + else { + // contains + p = data; + e = tail - n; + while (p <= e) { + if (memcmp(p, text, n) == 0) { + LOG(KERN_INFO "matched %s\n", text); + return 1; + } + ++p; + } + } + + text += o + 1; + } + return 0; +} + +static inline const char *findend(const char *data, const char *tail, int min) +{ + int n = tail - data; + if (n >= min) { + while (data < tail) { + if (*data == '\r') return data; + ++data; + } + } + return NULL; +} + +static int match(const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const void *matchinfo, int offset, const void *hdr, u_int16_t datalen, int *hotdrop) +{ + const struct ipt_web_info *info; + const struct tcphdr *tcph; + const char *data; + const char *tail; + const char *p, *q; + int doff, dlen; + + info = matchinfo; + + if (offset != 0) return info->invert; + + tcph = hdr; + doff = (tcph->doff * 4); + data = (char *)tcph + doff; + dlen = datalen - doff; + +#if 
0 + printk(KERN_INFO "datalen=%u dlen=%d doff=%d\n", datalen, dlen, doff); + char tmp[16]; + memcpy(tmp, data, sizeof(tmp)); + tmp[sizeof(tmp) - 1] = 0; + printk(KERN_INFO "[%s]\n", tmp); +#endif + + // POST / HTTP/1.0$$$$ + // GET / HTTP/1.0$$$$ + // 1234567890123456789 + if (dlen < 18) return info->invert; + + // "GET " or "POST" + __u32 sig = *(__u32 *)data; + if ((sig != __constant_htonl(0x47455420)) && (sig != __constant_htonl(0x504f5354))) { + return info->invert; + } + + tail = data + dlen; + if (dlen > 1024) { + dlen = 1024; + tail = data + 1024; + } + + + // POST / HTTP/1.0$$$$ + // GET / HTTP/1.0$$$$ -- minimum + // 0123456789012345678 + // 9876543210 + if (((p = findend(data + 14, tail, 18)) == NULL) || (memcmp(p - 9, " HTTP/", 6) != 0)) + return info->invert; + +#if 0 + { + const char *qq = info->text; + while (*qq) { + printk(KERN_INFO "text=%s\n", qq); + qq += strlen(qq) + 1; + } + } +#endif + + switch (info->mode) { + case IPT_WEB_HTTP: + return !info->invert; + case IPT_WEB_HORE: + // entire request line, else host line + if (find(data + 4, p - 9, info->text)) return !info->invert; + break; + case IPT_WEB_PATH: + // left side of '?' or entire line + q = data += 4; + p -= 9; + while ((q < p) && (*q != '?')) ++q; + return find(data, q, info->text) ^ info->invert; + case IPT_WEB_QUERY: + // right side of '?' 
or none + q = data + 4; + p -= 9; + while ((q < p) && (*q != '?')) ++q; + if (q >= p) return info->invert; + return find(q + 1, p, info->text) ^ info->invert; + case IPT_WEB_RURI: + // entire request line + return find(data + 4, p - 9, info->text) ^ info->invert; + default: + // shutup compiler + break; + } + + // else, IPT_WEB_HOST + + while (1) { + data = p + 2; // skip previous \r\n + p = findend(data, tail, 8); // p = current line's \r + if (p == NULL) return 0; + +#if 0 + char tmp[64]; + memcpy(tmp, data, 32); + tmp[32] = 0; + printk(KERN_INFO "data=[%s]\n", tmp); +#endif + + if (memcmp(data, "Host: ", 6) == 0) + return find(data + 6, p, info->text) ^ info->invert; + } + + return !info->invert; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_web_info))); +} + + +static struct ipt_match web_match += { { NULL, NULL }, "web", &match, &checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ +// LOG(KERN_INFO "ipt_web <" __DATE__ " " __TIME__ "> loaded\n"); + return ipt_register_match(&web_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&web_match); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.c b/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.c new file mode 100644 index 00000000..31ef35b9 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.c @@ -0,0 +1,1195 @@ +/* + * regcomp and regexec -- regsub and regerror are elsewhere + * @(#)regexp.c 1.3 of 18 April 87 + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. 
The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * Beware that some of this code is subtly aware of the way operator + * precedence is structured in regular expressions. Serious changes in + * regular-expression syntax might require a total rethink. + * + * This code was modified by Ethan Sommer to work within the kernel + * (it now uses kmalloc etc..) + * + * Modified slightly by Matthew Strait to use more modern C. + */ + +#include "regexp.h" +#include "regmagic.h" + +/* added by ethan and matt. Lets it work in both kernel and user space. +(So iptables can use it, for instance.) Yea, it goes both ways... */ +#if __KERNEL__ + #define malloc(foo) kmalloc(foo,GFP_ATOMIC) +#else + #define printk(format,args...) printf(format,##args) +#endif + +void regerror(char * s) +{ + printk("<3>Regexp: %s\n", s); + /* NOTREACHED */ +} + +/* + * The "internal use only" fields in regexp.h are present to pass info from + * compile to execute that permits the execute phase to run lots faster on + * simple cases. They are: + * + * regstart char that must begin a match; '\0' if none obvious + * reganch is the match anchored (at beginning-of-line only)? + * regmust string (pointer into program) that match must include, or NULL + * regmlen length of regmust string + * + * Regstart and reganch permit very fast decisions on suitable starting points + * for a match, cutting down the work a lot. Regmust permits fast rejection + * of lines that cannot possibly match. The regmust tests are costly enough + * that regcomp() supplies a regmust only if the r.e. 
contains something + * potentially expensive (at present, the only such thing detected is * or + + * at the start of the r.e., which can involve a lot of backup). Regmlen is + * supplied because the test in regexec() needs it and regcomp() is computing + * it anyway. + */ + +/* + * Structure for regexp "program". This is essentially a linear encoding + * of a nondeterministic finite-state machine (aka syntax charts or + * "railroad normal form" in parsing technology). Each node is an opcode + * plus a "next" pointer, possibly plus an operand. "Next" pointers of + * all nodes except BRANCH implement concatenation; a "next" pointer with + * a BRANCH on both ends of it is connecting two alternatives. (Here we + * have one of the subtle syntax dependencies: an individual BRANCH (as + * opposed to a collection of them) is never concatenated with anything + * because of operator precedence.) The operand of some types of node is + * a literal string; for others, it is a node leading into a sub-FSM. In + * particular, the operand of a BRANCH node is the first node of the branch. + * (NB this is *not* a tree structure: the tail of the branch connects + * to the thing following the set of BRANCHes.) The opcodes are: + */ + +/* definition number opnd? meaning */ +#define END 0 /* no End of program. */ +#define BOL 1 /* no Match "" at beginning of line. */ +#define EOL 2 /* no Match "" at end of line. */ +#define ANY 3 /* no Match any one character. */ +#define ANYOF 4 /* str Match any character in this string. */ +#define ANYBUT 5 /* str Match any character not in this string. */ +#define BRANCH 6 /* node Match this alternative, or the next... */ +#define BACK 7 /* no Match "", "next" ptr points backward. */ +#define EXACTLY 8 /* str Match this string. */ +#define NOTHING 9 /* no Match empty string. */ +#define STAR 10 /* node Match this (simple) thing 0 or more times. */ +#define PLUS 11 /* node Match this (simple) thing 1 or more times. 
*/ +#define OPEN 20 /* no Mark this point in input as start of #n. */ + /* OPEN+1 is number 1, etc. */ +#define CLOSE 30 /* no Analogous to OPEN. */ + +/* + * Opcode notes: + * + * BRANCH The set of branches constituting a single choice are hooked + * together with their "next" pointers, since precedence prevents + * anything being concatenated to any individual branch. The + * "next" pointer of the last BRANCH in a choice points to the + * thing following the whole choice. This is also where the + * final "next" pointer of each individual branch points; each + * branch starts with the operand node of a BRANCH node. + * + * BACK Normal "next" pointers all implicitly point forward; BACK + * exists to make loop structures possible. + * + * STAR,PLUS '?', and complex '*' and '+', are implemented as circular + * BRANCH structures using BACK. Simple cases (one character + * per match) are implemented with STAR and PLUS for speed + * and to minimize recursive plunges. + * + * OPEN,CLOSE ...are numbered at compile time. + */ + +/* + * A node is one char of opcode followed by two chars of "next" pointer. + * "Next" pointers are stored as two 8-bit pieces, high order first. The + * value is a positive offset from the opcode of the node containing it. + * An operand, if any, simply follows the node. (Note that much of the + * code generation knows about this implicit relationship.) + * + * Using two bytes for the "next" pointer is vast overkill for most things, + * but allows patterns to get big without disasters. + */ +#define OP(p) (*(p)) +#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) +#define OPERAND(p) ((p) + 3) + +/* + * See regmagic.h for one further detail of program structure. + */ + + +/* + * Utility definitions. 
+ */ +#ifndef CHARBITS +#define UCHARAT(p) ((int)*(unsigned char *)(p)) +#else +#define UCHARAT(p) ((int)*(p)&CHARBITS) +#endif + +#define FAIL(m) { regerror(m); return(NULL); } +#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define META "^$.[()|?+*\\" + +/* + * Flags to be passed up and down. + */ +#define HASWIDTH 01 /* Known never to match null string. */ +#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 04 /* Starts with * or +. */ +#define WORST 0 /* Worst case. */ + +/* + * Global work variables for regcomp(). + */ +static char *regparse; /* Input-scan pointer. */ +static int regnpar; /* () count. */ +static char regdummy; +static char *regcode; /* Code-emit pointer; ®dummy = don't. */ +static long regsize; /* Code size. */ + +/* + * Forward declarations for regcomp()'s friends. + */ +#ifndef STATIC +#define STATIC static +#endif +STATIC char *reg(int paren,int *flagp); +STATIC char *regbranch(int *flagp); +STATIC char *regpiece(int *flagp); +STATIC char *regatom(int *flagp); +STATIC char *regnode(char op); +STATIC char *regnext(char *p); +STATIC void regc(char b); +STATIC void reginsert(char op, char *opnd); +STATIC void regtail(char *p, char *val); +STATIC void regoptail(char *p, char *val); + + +__kernel_size_t my_strcspn(const char *s1,const char *s2) +{ + char *scan1; + char *scan2; + int count; + + count = 0; + for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) { + for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */ + if (*scan1 == *scan2++) + return(count); + count++; + } + return(count); +} + +/* + - regcomp - compile a regular expression into internal code + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". 
+ * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. + */ +regexp * +regcomp(char *exp,int *patternsize) +{ + register regexp *r; + register char *scan; + register char *longest; + register int len; + int flags; + /* commented out by ethan + extern char *malloc(); + */ + + if (exp == NULL) + FAIL("NULL argument"); + + /* First pass: determine size, legality. */ + regparse = exp; + regnpar = 1; + regsize = 0L; + regcode = ®dummy; + regc(MAGIC); + if (reg(0, &flags) == NULL) + return(NULL); + + /* Small enough for pointer-storage convention? */ + if (regsize >= 32767L) /* Probably could be 65535L. */ + FAIL("regexp too big"); + + /* Allocate space. */ + *patternsize=sizeof(regexp) + (unsigned)regsize; + r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); + if (r == NULL) + FAIL("out of space"); + + /* Second pass: emit code. */ + regparse = exp; + regnpar = 1; + regcode = r->program; + regc(MAGIC); + if (reg(0, &flags) == NULL) + return(NULL); + + /* Dig out information for optimizations. */ + r->regstart = '\0'; /* Worst-case defaults. */ + r->reganch = 0; + r->regmust = NULL; + r->regmlen = 0; + scan = r->program+1; /* First BRANCH. */ + if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ + scan = OPERAND(scan); + + /* Starting-point info. */ + if (OP(scan) == EXACTLY) + r->regstart = *OPERAND(scan); + else if (OP(scan) == BOL) + r->reganch++; + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. 
+ * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + */ + if (flags&SPSTART) { + longest = NULL; + len = 0; + for (; scan != NULL; scan = regnext(scan)) + if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } + r->regmust = longest; + r->regmlen = len; + } + } + + return(r); +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +static char * +reg(int paren, int *flagp /* Parenthesized? */ ) +{ + register char *ret; + register char *br; + register char *ender; + register int parno = 0; /* 0 makes gcc happy */ + int flags; + + *flagp = HASWIDTH; /* Tentatively. */ + + /* Make an OPEN node, if parenthesized. */ + if (paren) { + if (regnpar >= NSUBEXP) + FAIL("too many ()"); + parno = regnpar; + regnpar++; + ret = regnode(OPEN+parno); + } else + ret = NULL; + + /* Pick up the branches, linking them together. */ + br = regbranch(&flags); + if (br == NULL) + return(NULL); + if (ret != NULL) + regtail(ret, br); /* OPEN -> first. */ + else + ret = br; + if (!(flags&HASWIDTH)) + *flagp &= ~HASWIDTH; + *flagp |= flags&SPSTART; + while (*regparse == '|') { + regparse++; + br = regbranch(&flags); + if (br == NULL) + return(NULL); + regtail(ret, br); /* BRANCH -> BRANCH. */ + if (!(flags&HASWIDTH)) + *flagp &= ~HASWIDTH; + *flagp |= flags&SPSTART; + } + + /* Make a closing node, and hook it on the end. */ + ender = regnode((paren) ? CLOSE+parno : END); + regtail(ret, ender); + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br != NULL; br = regnext(br)) + regoptail(br, ender); + + /* Check for proper termination. 
*/ + if (paren && *regparse++ != ')') { + FAIL("unmatched ()"); + } else if (!paren && *regparse != '\0') { + if (*regparse == ')') { + FAIL("unmatched ()"); + } else + FAIL("junk on end"); /* "Can't happen". */ + /* NOTREACHED */ + } + + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + */ +static char * +regbranch(int *flagp) +{ + register char *ret; + register char *chain; + register char *latest; + int flags; + + *flagp = WORST; /* Tentatively. */ + + ret = regnode(BRANCH); + chain = NULL; + while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { + latest = regpiece(&flags); + if (latest == NULL) + return(NULL); + *flagp |= flags&HASWIDTH; + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else + regtail(chain, latest); + chain = latest; + } + if (chain == NULL) /* Loop ran zero times. */ + (void) regnode(NOTHING); + + return(ret); +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + */ +static char * +regpiece(int *flagp) +{ + register char *ret; + register char op; + register char *next; + int flags; + + ret = regatom(&flags); + if (ret == NULL) + return(NULL); + + op = *regparse; + if (!ISMULT(op)) { + *flagp = flags; + return(ret); + } + + if (!(flags&HASWIDTH) && op != '?') + FAIL("*+ operand could be empty"); + *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); + + if (op == '*' && (flags&SIMPLE)) + reginsert(STAR, ret); + else if (op == '*') { + /* Emit x* as (x&|), where & means "self". 
*/ + reginsert(BRANCH, ret); /* Either x */ + regoptail(ret, regnode(BACK)); /* and loop */ + regoptail(ret, ret); /* back */ + regtail(ret, regnode(BRANCH)); /* or */ + regtail(ret, regnode(NOTHING)); /* null. */ + } else if (op == '+' && (flags&SIMPLE)) + reginsert(PLUS, ret); + else if (op == '+') { + /* Emit x+ as x(&|), where & means "self". */ + next = regnode(BRANCH); /* Either */ + regtail(ret, next); + regtail(regnode(BACK), ret); /* loop back */ + regtail(next, regnode(BRANCH)); /* or */ + regtail(ret, regnode(NOTHING)); /* null. */ + } else if (op == '?') { + /* Emit x? as (x|) */ + reginsert(BRANCH, ret); /* Either x */ + regtail(ret, regnode(BRANCH)); /* or */ + next = regnode(NOTHING); /* null. */ + regtail(ret, next); + regoptail(ret, next); + } + regparse++; + if (ISMULT(*regparse)) + FAIL("nested *?+"); + + return(ret); +} + +/* + - regatom - the lowest level + * + * Optimization: gobbles an entire sequence of ordinary characters so that + * it can turn them into a single node, which is smaller to store and + * faster to run. Backslashed characters are exceptions, each becoming a + * separate node; the code is simpler that way and it's not worth fixing. + */ +static char * +regatom(int *flagp) +{ + register char *ret; + int flags; + + *flagp = WORST; /* Tentatively. */ + + switch (*regparse++) { + case '^': + ret = regnode(BOL); + break; + case '$': + ret = regnode(EOL); + break; + case '.': + ret = regnode(ANY); + *flagp |= HASWIDTH|SIMPLE; + break; + case '[': { + register int class; + register int classend; + + if (*regparse == '^') { /* Complement of range. 
*/ + ret = regnode(ANYBUT); + regparse++; + } else + ret = regnode(ANYOF); + if (*regparse == ']' || *regparse == '-') + regc(*regparse++); + while (*regparse != '\0' && *regparse != ']') { + if (*regparse == '-') { + regparse++; + if (*regparse == ']' || *regparse == '\0') + regc('-'); + else { + class = UCHARAT(regparse-2)+1; + classend = UCHARAT(regparse); + if (class > classend+1) + FAIL("invalid [] range"); + for (; class <= classend; class++) + regc(class); + regparse++; + } + } else + regc(*regparse++); + } + regc('\0'); + if (*regparse != ']') + FAIL("unmatched []"); + regparse++; + *flagp |= HASWIDTH|SIMPLE; + } + break; + case '(': + ret = reg(1, &flags); + if (ret == NULL) + return(NULL); + *flagp |= flags&(HASWIDTH|SPSTART); + break; + case '\0': + case '|': + case ')': + FAIL("internal urp"); /* Supposed to be caught earlier. */ + break; + case '?': + case '+': + case '*': + FAIL("?+* follows nothing"); + break; + case '\\': + if (*regparse == '\0') + FAIL("trailing \\"); + ret = regnode(EXACTLY); + regc(*regparse++); + regc('\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + default: { + register int len; + register char ender; + + regparse--; + len = my_strcspn((const char *)regparse, (const char *)META); + if (len <= 0) + FAIL("internal disaster"); + ender = *(regparse+len); + if (len > 1 && ISMULT(ender)) + len--; /* Back off clear of ?+* operand. */ + *flagp |= HASWIDTH; + if (len == 1) + *flagp |= SIMPLE; + ret = regnode(EXACTLY); + while (len > 0) { + regc(*regparse++); + len--; + } + regc('\0'); + } + break; + } + + return(ret); +} + +/* + - regnode - emit a node + */ +static char * /* Location. */ +regnode(char op) +{ + register char *ret; + register char *ptr; + + ret = regcode; + if (ret == ®dummy) { + regsize += 3; + return(ret); + } + + ptr = ret; + *ptr++ = op; + *ptr++ = '\0'; /* Null "next" pointer. 
*/ + *ptr++ = '\0'; + regcode = ptr; + + return(ret); +} + +/* + - regc - emit (if appropriate) a byte of code + */ +static void +regc(char b) +{ + if (regcode != ®dummy) + *regcode++ = b; + else + regsize++; +} + +/* + - reginsert - insert an operator in front of already-emitted operand + * + * Means relocating the operand. + */ +static void +reginsert(char op, char* opnd) +{ + register char *src; + register char *dst; + register char *place; + + if (regcode == ®dummy) { + regsize += 3; + return; + } + + src = regcode; + regcode += 3; + dst = regcode; + while (src > opnd) + *--dst = *--src; + + place = opnd; /* Op node, where operand used to be. */ + *place++ = op; + *place++ = '\0'; + *place++ = '\0'; +} + +/* + - regtail - set the next-pointer at the end of a node chain + */ +static void +regtail(char *p, char *val) +{ + register char *scan; + register char *temp; + register int offset; + + if (p == ®dummy) + return; + + /* Find last node. */ + scan = p; + for (;;) { + temp = regnext(scan); + if (temp == NULL) + break; + scan = temp; + } + + if (OP(scan) == BACK) + offset = scan - val; + else + offset = val - scan; + *(scan+1) = (offset>>8)&0377; + *(scan+2) = offset&0377; +} + +/* + - regoptail - regtail on operand of first argument; nop if operandless + */ +static void +regoptail(char *p, char *val) +{ + /* "Operandless" and "op != BRANCH" are synonymous in practice. */ + if (p == NULL || p == ®dummy || OP(p) != BRANCH) + return; + regtail(OPERAND(p), val); +} + +/* + * regexec and friends + */ + +/* + * Global work variables for regexec(). + */ +static char *reginput; /* String-input pointer. */ +static char *regbol; /* Beginning of input, for ^ check. */ +static char **regstartp; /* Pointer to startp array. */ +static char **regendp; /* Ditto for endp. */ + +/* + * Forwards. 
+ */ +STATIC int regtry(regexp *prog, char *string); +STATIC int regmatch(char *prog); +STATIC int regrepeat(char *p); + +#ifdef DEBUG +int regnarrate = 0; +void regdump(); +STATIC char *regprop(char *op); +#endif + +/* + - regexec - match a regexp against a string + */ +int +regexec(regexp *prog, char *string) +{ + register char *s; + + /* Be paranoid... */ + if (prog == NULL || string == NULL) { + printk("<3>Regexp: NULL parameter\n"); + return(0); + } + + /* Check validity of program. */ + if (UCHARAT(prog->program) != MAGIC) { + printk("<3>Regexp: corrupted program\n"); + return(0); + } + + /* If there is a "must appear" string, look for it. */ + if (prog->regmust != NULL) { + s = string; + while ((s = strchr(s, prog->regmust[0])) != NULL) { + if (strncmp(s, prog->regmust, prog->regmlen) == 0) + break; /* Found it. */ + s++; + } + if (s == NULL) /* Not present. */ + return(0); + } + + /* Mark beginning of line for ^ . */ + regbol = string; + + /* Simplest case: anchored match need be tried only once. */ + if (prog->reganch) + return(regtry(prog, string)); + + /* Messy cases: unanchored match. */ + s = string; + if (prog->regstart != '\0') + /* We know what char it must start with. */ + while ((s = strchr(s, prog->regstart)) != NULL) { + if (regtry(prog, s)) + return(1); + s++; + } + else + /* We don't -- general case. */ + do { + if (regtry(prog, s)) + return(1); + } while (*s++ != '\0'); + + /* Failure. 
*/ + return(0); +} + +/* + - regtry - try match at specific point + */ +static int /* 0 failure, 1 success */ +regtry(regexp *prog, char *string) +{ + register int i; + register char **sp; + register char **ep; + + reginput = string; + regstartp = prog->startp; + regendp = prog->endp; + + sp = prog->startp; + ep = prog->endp; + for (i = NSUBEXP; i > 0; i--) { + *sp++ = NULL; + *ep++ = NULL; + } + if (regmatch(prog->program + 1)) { + prog->startp[0] = string; + prog->endp[0] = reginput; + return(1); + } else + return(0); +} + +/* + - regmatch - main matching routine + * + * Conceptually the strategy is simple: check to see whether the current + * node matches, call self recursively to see whether the rest matches, + * and then act accordingly. In practice we make some effort to avoid + * recursion, in particular by going through "ordinary" nodes (that don't + * need to know whether the rest of the match failed) by a loop instead of + * by recursion. + */ +static int /* 0 failure, 1 success */ +regmatch(char *prog) +{ + register char *scan = prog; /* Current node. */ + char *next; /* Next node. */ + +#ifdef DEBUG + if (scan != NULL && regnarrate) + fprintf(stderr, "%s(\n", regprop(scan)); +#endif + while (scan != NULL) { +#ifdef DEBUG + if (regnarrate) + fprintf(stderr, "%s...\n", regprop(scan)); +#endif + next = regnext(scan); + + switch (OP(scan)) { + case BOL: + if (reginput != regbol) + return(0); + break; + case EOL: + if (*reginput != '\0') + return(0); + break; + case ANY: + if (*reginput == '\0') + return(0); + reginput++; + break; + case EXACTLY: { + register int len; + register char *opnd; + + opnd = OPERAND(scan); + /* Inline the first character, for speed. 
*/ + if (*opnd != *reginput) + return(0); + len = strlen(opnd); + if (len > 1 && strncmp(opnd, reginput, len) != 0) + return(0); + reginput += len; + } + break; + case ANYOF: + if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) + return(0); + reginput++; + break; + case ANYBUT: + if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) + return(0); + reginput++; + break; + case NOTHING: + case BACK: + break; + case OPEN+1: + case OPEN+2: + case OPEN+3: + case OPEN+4: + case OPEN+5: + case OPEN+6: + case OPEN+7: + case OPEN+8: + case OPEN+9: { + register int no; + register char *save; + + no = OP(scan) - OPEN; + save = reginput; + + if (regmatch(next)) { + /* + * Don't set startp if some later + * invocation of the same parentheses + * already has. + */ + if (regstartp[no] == NULL) + regstartp[no] = save; + return(1); + } else + return(0); + } + break; + case CLOSE+1: + case CLOSE+2: + case CLOSE+3: + case CLOSE+4: + case CLOSE+5: + case CLOSE+6: + case CLOSE+7: + case CLOSE+8: + case CLOSE+9: + { + register int no; + register char *save; + + no = OP(scan) - CLOSE; + save = reginput; + + if (regmatch(next)) { + /* + * Don't set endp if some later + * invocation of the same parentheses + * already has. + */ + if (regendp[no] == NULL) + regendp[no] = save; + return(1); + } else + return(0); + } + break; + case BRANCH: { + register char *save; + + if (OP(next) != BRANCH) /* No choice. */ + next = OPERAND(scan); /* Avoid recursion. */ + else { + do { + save = reginput; + if (regmatch(OPERAND(scan))) + return(1); + reginput = save; + scan = regnext(scan); + } while (scan != NULL && OP(scan) == BRANCH); + return(0); + /* NOTREACHED */ + } + } + break; + case STAR: + case PLUS: { + register char nextch; + register int no; + register char *save; + register int min; + + /* + * Lookahead to avoid useless match attempts + * when we know what character comes next. 
+ */ + nextch = '\0'; + if (OP(next) == EXACTLY) + nextch = *OPERAND(next); + min = (OP(scan) == STAR) ? 0 : 1; + save = reginput; + no = regrepeat(OPERAND(scan)); + while (no >= min) { + /* If it could work, try it. */ + if (nextch == '\0' || *reginput == nextch) + if (regmatch(next)) + return(1); + /* Couldn't or didn't -- back up. */ + no--; + reginput = save + no; + } + return(0); + } + break; + case END: + return(1); /* Success! */ + break; + default: + printk("<3>Regexp: memory corruption\n"); + return(0); + break; + } + + scan = next; + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + printk("<3>Regexp: corrupted pointers\n"); + return(0); +} + +/* + - regrepeat - repeatedly match something simple, report how many + */ +static int +regrepeat(char *p) +{ + register int count = 0; + register char *scan; + register char *opnd; + + scan = reginput; + opnd = OPERAND(p); + switch (OP(p)) { + case ANY: + count = strlen(scan); + scan += count; + break; + case EXACTLY: + while (*opnd == *scan) { + count++; + scan++; + } + break; + case ANYOF: + while (*scan != '\0' && strchr(opnd, *scan) != NULL) { + count++; + scan++; + } + break; + case ANYBUT: + while (*scan != '\0' && strchr(opnd, *scan) == NULL) { + count++; + scan++; + } + break; + default: /* Oh dear. Called inappropriately. */ + printk("<3>Regexp: internal foulup\n"); + count = 0; /* Best compromise. 
*/ + break; + } + reginput = scan; + + return(count); +} + +/* + - regnext - dig the "next" pointer out of a node + */ +static char* +regnext(char *p) +{ + register int offset; + + if (p == ®dummy) + return(NULL); + + offset = NEXT(p); + if (offset == 0) + return(NULL); + + if (OP(p) == BACK) + return(p-offset); + else + return(p+offset); +} + +#ifdef DEBUG + +STATIC char *regprop(); + +/* + - regdump - dump a regexp onto stdout in vaguely comprehensible form + */ +void +regdump(regexp *r) +{ + register char *s; + register char op = EXACTLY; /* Arbitrary non-END op. */ + register char *next; + /* extern char *strchr(); */ + + + s = r->program + 1; + while (op != END) { /* While that wasn't END last time... */ + op = OP(s); + printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ + next = regnext(s); + if (next == NULL) /* Next ptr. */ + printf("(0)"); + else + printf("(%d)", (s-r->program)+(next-s)); + s += 3; + if (op == ANYOF || op == ANYBUT || op == EXACTLY) { + /* Literal string, where present. */ + while (*s != '\0') { + putchar(*s); + s++; + } + s++; + } + putchar('\n'); + } + + /* Header fields of interest. 
*/ + if (r->regstart != '\0') + printf("start `%c' ", r->regstart); + if (r->reganch) + printf("anchored "); + if (r->regmust != NULL) + printf("must have \"%s\"", r->regmust); + printf("\n"); +} + +/* + - regprop - printable representation of opcode + */ +static char * +regprop(char *op) +{ +#define BUFLEN 50 + register char *p; + static char buf[BUFLEN]; + + strcpy(buf, ":"); + + switch (OP(op)) { + case BOL: + p = "BOL"; + break; + case EOL: + p = "EOL"; + break; + case ANY: + p = "ANY"; + break; + case ANYOF: + p = "ANYOF"; + break; + case ANYBUT: + p = "ANYBUT"; + break; + case BRANCH: + p = "BRANCH"; + break; + case EXACTLY: + p = "EXACTLY"; + break; + case NOTHING: + p = "NOTHING"; + break; + case BACK: + p = "BACK"; + break; + case END: + p = "END"; + break; + case OPEN+1: + case OPEN+2: + case OPEN+3: + case OPEN+4: + case OPEN+5: + case OPEN+6: + case OPEN+7: + case OPEN+8: + case OPEN+9: + snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "OPEN%d", OP(op)-OPEN); + p = NULL; + break; + case CLOSE+1: + case CLOSE+2: + case CLOSE+3: + case CLOSE+4: + case CLOSE+5: + case CLOSE+6: + case CLOSE+7: + case CLOSE+8: + case CLOSE+9: + snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "CLOSE%d", OP(op)-CLOSE); + p = NULL; + break; + case STAR: + p = "STAR"; + break; + case PLUS: + p = "PLUS"; + break; + default: + printk("<3>Regexp: corrupted opcode\n"); + break; + } + if (p != NULL) + strncat(buf, p, BUFLEN-strlen(buf)); + return(buf); +} +#endif + + diff --git a/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.h b/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.h new file mode 100644 index 00000000..fda9a7c4 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/regexp/regexp.h @@ -0,0 +1,40 @@ +/* + * Definitions etc. for regexp(3) routines. + * + * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], + * not the System V one. 
+ */ + +#ifndef REGEXP_H +#define REGEXP_H + +/* +http://www.opensource.apple.com/darwinsource/10.3/expect-1/expect/expect.h , +which contains a version of this library, says: + + * + * NSUBEXP must be at least 10, and no greater than 117 or the parser + * will not work properly. + * + +However, it looks rather like this library is limited to 10. If you think +otherwise, let us know. +*/ + +#define NSUBEXP 10 +typedef struct regexp { + char *startp[NSUBEXP]; + char *endp[NSUBEXP]; + char regstart; /* Internal use only. */ + char reganch; /* Internal use only. */ + char *regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + char program[1]; /* Unwarranted chumminess with compiler. */ +} regexp; + +regexp * regcomp(char *exp, int *patternsize); +int regexec(regexp *prog, char *string); +void regsub(regexp *prog, char *source, char *dest); +void regerror(char *s); + +#endif diff --git a/release/src/linux/linux/net/ipv4/netfilter/regexp/regmagic.h b/release/src/linux/linux/net/ipv4/netfilter/regexp/regmagic.h new file mode 100644 index 00000000..5acf4478 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/regexp/regmagic.h @@ -0,0 +1,5 @@ +/* + * The first byte of the regexp internal "program" is actually this magic + * number; the start node begins in the second byte. + */ +#define MAGIC 0234 diff --git a/release/src/linux/linux/net/ipv4/netfilter/regexp/regsub.c b/release/src/linux/linux/net/ipv4/netfilter/regexp/regsub.c new file mode 100644 index 00000000..339631f0 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/regexp/regsub.c @@ -0,0 +1,95 @@ +/* + * regsub + * @(#)regsub.c 1.3 of 2 April 86 + * + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. + * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: + * + * 1. 
The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. + * + * + * This code was modified by Ethan Sommer to work within the kernel + * (it now uses kmalloc etc..) + * + */ +#include "regexp.h" +#include "regmagic.h" +#include + + +#ifndef CHARBITS +#define UCHARAT(p) ((int)*(unsigned char *)(p)) +#else +#define UCHARAT(p) ((int)*(p)&CHARBITS) +#endif + +#if 0 +//void regerror(char * s) +//{ +// printk("regexp(3): %s", s); +// /* NOTREACHED */ +//} +#endif + +/* + - regsub - perform substitutions after a regexp match + */ +void +regsub(regexp * prog, char * source, char * dest) +{ + register char *src; + register char *dst; + register char c; + register int no; + register int len; + + /* Not necessary and gcc doesn't like it -MLS */ + /*extern char *strncpy();*/ + + if (prog == NULL || source == NULL || dest == NULL) { + regerror("NULL parm to regsub"); + return; + } + if (UCHARAT(prog->program) != MAGIC) { + regerror("damaged regexp fed to regsub"); + return; + } + + src = source; + dst = dest; + while ((c = *src++) != '\0') { + if (c == '&') + no = 0; + else if (c == '\\' && '0' <= *src && *src <= '9') + no = *src++ - '0'; + else + no = -1; + + if (no < 0) { /* Ordinary character. */ + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + *dst++ = c; + } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) { + len = prog->endp[no] - prog->startp[no]; + (void) strncpy(dst, prog->startp[no], len); + dst += len; + if (len != 0 && *(dst-1) == '\0') { /* strncpy hit NUL. 
*/ + regerror("damaged match string"); + return; + } + } + } + *dst++ = '\0'; +} diff --git a/release/src/linux/linux/net/ipv4/netfilter/tomato_ct.c b/release/src/linux/linux/net/ipv4/netfilter/tomato_ct.c new file mode 100644 index 00000000..a84cab09 --- /dev/null +++ b/release/src/linux/linux/net/ipv4/netfilter/tomato_ct.c @@ -0,0 +1,181 @@ +/* + + tomato_ct.c + Copyright (C) 2006 Jonathan Zarate + + Licensed under GNU GPL v2. + +*/ +#include +#include +#include +#include + +// #define TEST_HASHDIST + + +#ifdef TEST_HASHDIST +static int hashdist_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +{ + struct list_head *h; + struct list_head *e; + int i; + int n; + int count; + char *buf; + int max; + + // do this the easy way... + max = ip_conntrack_htable_size * sizeof("12345\t12345\n"); + buf = kmalloc(max + 1, GFP_KERNEL); + if (buf == NULL) return 0; + + n = 0; + max -= sizeof("12345\t12345\n"); + + READ_LOCK(&ip_conntrack_lock); + + for (i = 0; i < ip_conntrack_htable_size; ++i) { + count = 0; + h = &ip_conntrack_hash[i]; + if (h) { + e = h; + while (e->next != h) { + ++count; + e = e->next; + } + } + + n += sprintf(buf + n, "%d\t%d\n", i, count); + if (n > max) { + printk("hashdist: %d > %d\n", n, max); + break; + } + } + + READ_UNLOCK(&ip_conntrack_lock); + + if (offset < n) { + n = n - offset; + if (n > length) { + n = length; + *eof = 0; + } + else { + *eof = 1; + } + memcpy(buffer, buf + offset, n); + *start = buffer; + } + else { + n = 0; + *eof = 1; + } + + kfree(buf); + return n; +} +#endif + + +static void interate_all(void (*func)(struct ip_conntrack *, unsigned long), unsigned long data) +{ + int i; + struct list_head *h; + struct list_head *e; + + WRITE_LOCK(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; ++i) { + h = &ip_conntrack_hash[i]; + if (h) { + e = h; + while (e->next != h) { + e = e->next; + func(((struct ip_conntrack_tuple_hash *)e)->ctrack, data); + } + } + } + 
WRITE_UNLOCK(&ip_conntrack_lock); +} + +static void expireearly(struct ip_conntrack *ct, unsigned long data) +{ + if (ct->timeout.expires > data) { + if (del_timer(&ct->timeout)) { + ct->timeout.expires = data; + add_timer(&ct->timeout); + } + } +} + +static int expireearly_write(struct file *file, const char *buffer, unsigned long length, void *data) +{ + char s[8]; + unsigned long n; + + if ((length > 0) && (length < 6)) { + memcpy(s, buffer, length); + s[length] = 0; + n = simple_strtoul(s, NULL, 10); + if (n < 10) n = 10; + else if (n > 86400) n = 86400; + + interate_all(expireearly, jiffies + (n * HZ)); + } + +/* + if ((length > 0) && (buffer[0] == '1')) { + interate_all(expireearly, jiffies + (20 * HZ)); + } +*/ + + return length; +} + + +static void clearmarks(struct ip_conntrack *ct, unsigned long data) +{ + ct->mark = 0; +} + +static int clearmarks_write(struct file *file, const char *buffer, unsigned long length, void *data) +{ + if ((length > 0) && (buffer[0] == '1')) { + interate_all(clearmarks, 0); + } + return length; +} + +static int __init init(void) +{ + struct proc_dir_entry *p; + + printk(__FILE__ " [" __DATE__ " " __TIME__ "]\n"); + +#ifdef TEST_HASHDIST + p = create_proc_entry("hash_dist", 0400, proc_net); + if (p) p->read_proc = hashdist_read; +#endif + + p = create_proc_entry("expire_early", 0200, proc_net); + if (p) p->write_proc = expireearly_write; + + p = create_proc_entry("clear_marks", 0200, proc_net); + if (p) p->write_proc = clearmarks_write; + + return 0; +} + +static void __exit fini(void) +{ +#ifdef TEST_HASHDIST + remove_proc_entry("hash_dist", proc_net); +#endif + remove_proc_entry("expire_early", proc_net); + remove_proc_entry("clear_marks", proc_net); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv4/route.c b/release/src/linux/linux/net/ipv4/route.c index dfae0871..f3cf20df 100644 --- a/release/src/linux/linux/net/ipv4/route.c +++ 
b/release/src/linux/linux/net/ipv4/route.c @@ -2465,6 +2465,7 @@ void __init ip_rt_init(void) panic("IP: failed to allocate ip_dst_cache\n"); goal = num_physpages >> (26 - PAGE_SHIFT); +// goal = num_physpages >> (21 - PAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; @@ -2494,9 +2495,18 @@ void __init ip_rt_init(void) rt_hash_table[i].chain = NULL; } +// ip_rt_max_size = (rt_hash_mask + 1) * 2; +// ipv4_dst_ops.gc_thresh = (ip_rt_max_size / 4); + ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); ip_rt_max_size = (rt_hash_mask + 1) * 16; +// printk("gc_thresh=%d\n", ipv4_dst_ops.gc_thresh); +// printk("ip_rt_max_size=%d\n", ip_rt_max_size); +// printk("rt_hash_mask=%d\n", rt_hash_mask); +// printk("goal=%d\n", goal); + + devinet_init(); ip_fib_init(); diff --git a/release/src/linux/linux/net/ipv4/sysctl_net_ipv4.c b/release/src/linux/linux/net/ipv4/sysctl_net_ipv4.c index 1f4081a9..7fe16445 100644 --- a/release/src/linux/linux/net/ipv4/sysctl_net_ipv4.c +++ b/release/src/linux/linux/net/ipv4/sysctl_net_ipv4.c @@ -221,6 +221,18 @@ ctl_table ipv4_table[] = { &sysctl_icmp_ratemask, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_TCP_TW_REUSE, "tcp_tw_reuse", &sysctl_tcp_tw_reuse, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_VEGAS, "tcp_vegas_cong_avoid", + &sysctl_tcp_vegas_cong_avoid, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_TCP_VEGAS_ALPHA, "tcp_vegas_alpha", + &sysctl_tcp_vegas_alpha, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_TCP_VEGAS_BETA, "tcp_vegas_beta", + &sysctl_tcp_vegas_beta, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_TCP_VEGAS_GAMMA, "tcp_vegas_gamma", + &sysctl_tcp_vegas_gamma, sizeof(int), 0644, NULL, + &proc_dointvec}, {0} }; diff --git a/release/src/linux/linux/net/ipv4/tcp_input.c b/release/src/linux/linux/net/ipv4/tcp_input.c index 8c99dd52..243e2991 100644 --- a/release/src/linux/linux/net/ipv4/tcp_input.c +++ b/release/src/linux/linux/net/ipv4/tcp_input.c @@ -87,6 +87,16 @@ int 
sysctl_tcp_stdurg = 0; int sysctl_tcp_rfc1337 = 0; int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_vegas_cong_avoid = 0; + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +int sysctl_tcp_vegas_alpha = 1<vegas.do_vegas = 1; + tp->vegas.baseRTT = 0x7fffffff; + tcp_vegas_enable(tp); + } else + tcp_vegas_disable(tp); +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt) +{ + __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < tp->vegas.baseRTT) + tp->vegas.baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); + tp->vegas.cntRTT++; +} + /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -412,6 +458,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) { long m = mrtt; /* RTT */ + if (tcp_vegas_enabled(tp)) + vegas_rtt_calc(tp, mrtt); + /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. 
@@ -1013,7 +1062,7 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_sync_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tp->ca_state = TCP_CA_Loss; + tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); } @@ -1375,7 +1424,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_opt *tp) tcp_moderate_cwnd(tp); return 1; } - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); return 0; } @@ -1435,7 +1484,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp) tp->retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); return 1; } return 0; @@ -1466,7 +1515,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag) state = TCP_CA_Disorder; if (tp->ca_state != state) { - tp->ca_state = state; + tcp_set_ca_state(tp, state); tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); @@ -1540,7 +1589,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { tcp_complete_cwr(tp); - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); } break; @@ -1551,7 +1600,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * catching for all duplicate ACKs. */ IsReno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); } break; @@ -1625,7 +1674,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } tp->snd_cwnd_cnt = 0; - tp->ca_state = TCP_CA_Recovery; + tcp_set_ca_state(tp, TCP_CA_Recovery); } if (is_dupack || tcp_head_timedout(sk, tp)) @@ -1696,7 +1745,7 @@ tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. 
*/ -static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) +static __inline__ void reno_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ @@ -1716,6 +1765,236 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +/* This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. 
+ */ +static void vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt) +{ + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, tp->vegas.beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = tp->vegas.beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; + tp->vegas.beg_snd_nxt = tp->snd_nxt; + tp->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. 
This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + vegas_rtt_calc(tp, seq_rtt); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (tp->vegas.cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = tp->vegas.minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * tp->vegas.baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > sysctl_tcp_vegas_gamma) { + /* Going too fast. 
Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > sysctl_tcp_vegas_beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < sysctl_tcp_vegas_alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + tp->vegas.cntRTT = 0; + tp->vegas.minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. 
+ */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); + + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt) +{ + if (tcp_vegas_enabled(tp)) + vegas_cong_avoid(tp, ack, seq_rtt); + else + reno_cong_avoid(tp); +} + /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -1730,7 +2009,7 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) } /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; @@ -1813,6 +2092,7 @@ static int tcp_clean_rtx_queue(struct sock *sk) } } #endif + *seq_rtt_p = seq_rtt; return acked; } @@ -1900,6 +2180,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; + s32 seq_rtt; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -1947,17 +2228,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. 
*/ - if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd && + if ((flag&FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp); + tcp_cong_avoid(tp, ack, seq_rtt); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd) - tcp_cong_avoid(tp); + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) + tcp_cong_avoid(tp, ack, seq_rtt); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) diff --git a/release/src/linux/linux/net/ipv4/tcp_minisocks.c b/release/src/linux/linux/net/ipv4/tcp_minisocks.c index b69cc32c..6fdb7681 100644 --- a/release/src/linux/linux/net/ipv4/tcp_minisocks.c +++ b/release/src/linux/linux/net/ipv4/tcp_minisocks.c @@ -715,7 +715,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; - newtp->ca_state = TCP_CA_Open; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = NULL; @@ -783,6 +783,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, newtp->mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); + tcp_vegas_init(newtp); TCP_INC_STATS_BH(TcpPassiveOpens); } return newsk; diff --git a/release/src/linux/linux/net/ipv4/tcp_output.c b/release/src/linux/linux/net/ipv4/tcp_output.c index 35cbbbf7..3fd4871f 100644 --- a/release/src/linux/linux/net/ipv4/tcp_output.c +++ b/release/src/linux/linux/net/ipv4/tcp_output.c @@ -105,6 +105,9 @@ static void tcp_cwnd_restart(struct tcp_opt *tp) u32 restart_cwnd = tcp_init_cwnd(tp); u32 cwnd = tp->snd_cwnd; + if (tcp_is_vegas(tp)) + tcp_vegas_enable(tp); + tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -223,6 +226,19 @@ int tcp_transmit_skb(struct sock *sk, 
struct sk_buff *skb) tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); } + + /* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ + if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) + tcp_vegas_enable(tp); + th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; skb_set_owner_w(skb, sk); @@ -800,7 +816,7 @@ void tcp_simple_retransmit(struct sock *sk) tp->snd_ssthresh = tcp_current_ssthresh(tp); tp->prior_ssthresh = 0; tp->undo_marker = 0; - tp->ca_state = TCP_CA_Loss; + tcp_set_ca_state(tp, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } @@ -1181,6 +1197,7 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst->window; tp->advmss = dst->advmss; tcp_initialize_rcv_mss(sk); + tcp_vegas_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1231,6 +1248,7 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + tcp_vegas_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/release/src/linux/linux/net/ipv6/netfilter/Config.in b/release/src/linux/linux/net/ipv6/netfilter/Config.in index 062ed247..5d2dac2f 100644 --- a/release/src/linux/linux/net/ipv6/netfilter/Config.in +++ b/release/src/linux/linux/net/ipv6/netfilter/Config.in @@ -17,6 +17,7 @@ tristate 'IP6 tables support (required for filtering/masq/NAT)' CONFIG_IP6_NF_IP if [ "$CONFIG_IP6_NF_IPTABLES" != "n" ]; then # The simple matches. 
dep_tristate ' limit match support' CONFIG_IP6_NF_MATCH_LIMIT $CONFIG_IP6_NF_IPTABLES + dep_tristate ' condition match support' CONFIG_IP6_NF_MATCH_CONDITION $CONFIG_IP6_NF_IPTABLES dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then @@ -55,6 +56,9 @@ if [ "$CONFIG_IP6_NF_IPTABLES" != "n" ]; then if [ "$CONFIG_IP6_NF_MANGLE" != "n" ]; then # dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE dep_tristate ' MARK target support' CONFIG_IP6_NF_TARGET_MARK $CONFIG_IP6_NF_MANGLE + dep_tristate ' ROUTE target support' CONFIG_IP6_NF_TARGET_ROUTE $CONFIG_IP6_NF_MANGLE + + dep_tristate ' IMQ target support' CONFIG_IP6_NF_TARGET_IMQ $CONFIG_IP6_NF_MANGLE fi #dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES fi diff --git a/release/src/linux/linux/net/ipv6/netfilter/Makefile b/release/src/linux/linux/net/ipv6/netfilter/Makefile index dfd36a89..2bd664f4 100644 --- a/release/src/linux/linux/net/ipv6/netfilter/Makefile +++ b/release/src/linux/linux/net/ipv6/netfilter/Makefile @@ -14,6 +14,7 @@ export-objs := ip6_tables.o # Link order matters here. 
obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o obj-$(CONFIG_IP6_NF_MATCH_LIMIT) += ip6t_limit.o +obj-$(CONFIG_IP6_NF_MATCH_CONDITION) += ip6t_condition.o obj-$(CONFIG_IP6_NF_MATCH_MARK) += ip6t_mark.o obj-$(CONFIG_IP6_NF_MATCH_LENGTH) += ip6t_length.o obj-$(CONFIG_IP6_NF_MATCH_MAC) += ip6t_mac.o @@ -23,6 +24,8 @@ obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o +obj-$(CONFIG_IP6_NF_TARGET_ROUTE) += ip6t_ROUTE.o +obj-$(CONFIG_IP6_NF_TARGET_IMQ) += ip6t_IMQ.o obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o diff --git a/release/src/linux/linux/net/ipv6/netfilter/ip6_tables.c b/release/src/linux/linux/net/ipv6/netfilter/ip6_tables.c index b521af78..8eac7586 100644 --- a/release/src/linux/linux/net/ipv6/netfilter/ip6_tables.c +++ b/release/src/linux/linux/net/ipv6/netfilter/ip6_tables.c @@ -1241,13 +1241,7 @@ do_add_counters(void *user, unsigned int len) goto free; write_lock_bh(&t->lock); - /************************************* - * modify by tanghui @ 2006-10-11 - * for a RACE CONDITION in the "do_add_counters()" function - *************************************/ if (t->private->number != paddc->num_counters) { - if (t->private->number != tmp.num_counters) { - /*************************************/ ret = -EINVAL; goto unlock_up_free; } diff --git a/release/src/linux/linux/net/ipv6/netfilter/ip6t_IMQ.c b/release/src/linux/linux/net/ipv6/netfilter/ip6t_IMQ.c new file mode 100644 index 00000000..760d7447 --- /dev/null +++ b/release/src/linux/linux/net/ipv6/netfilter/ip6t_IMQ.c @@ -0,0 +1,78 @@ +/* This target marks packets to be enqueued to an imq device */ +#include +#include +#include +#include +#include + +static unsigned int imq_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) 
+{ + struct ip6t_imq_info *mr = (struct ip6t_imq_info*)targinfo; + + (*pskb)->imq_flags = mr->todev | IMQ_F_ENQUEUE; + (*pskb)->nfcache |= NFC_ALTERED; + + return IP6T_CONTINUE; +} + +static int imq_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip6t_imq_info *mr; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_imq_info))) { + printk(KERN_WARNING "IMQ: invalid targinfosize\n"); + return 0; + } + mr = (struct ip6t_imq_info*)targinfo; + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING + "IMQ: IMQ can only be called from \"mangle\" table, not \"%s\"\n", + tablename); + return 0; + } + + if (mr->todev > IMQ_MAX_DEVS) { + printk(KERN_WARNING + "IMQ: invalid device specified, highest is %u\n", + IMQ_MAX_DEVS); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_imq_reg = { + { NULL, NULL}, + "IMQ", + imq_target, + imq_checkentry, + NULL, + THIS_MODULE +}; + +static int __init init(void) +{ + if (ip6t_register_target(&ip6t_imq_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_imq_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv6/netfilter/ip6t_ROUTE.c b/release/src/linux/linux/net/ipv6/netfilter/ip6t_ROUTE.c new file mode 100644 index 00000000..bb6d11f8 --- /dev/null +++ b/release/src/linux/linux/net/ipv6/netfilter/ip6t_ROUTE.c @@ -0,0 +1,308 @@ +/* + * This implements the ROUTE v6 target, which enables you to setup unusual + * routes not supported by the standard kernel routing table. + * + * Copyright (C) 2003 Cedric de Launois + * + * v 1.1 2004/11/23 + * + * This software is distributed under GNU GPL v2, 1991 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 1 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +#define NIP6(addr) \ + ntohs((addr).s6_addr16[0]), \ + ntohs((addr).s6_addr16[1]), \ + ntohs((addr).s6_addr16[2]), \ + ntohs((addr).s6_addr16[3]), \ + ntohs((addr).s6_addr16[4]), \ + ntohs((addr).s6_addr16[5]), \ + ntohs((addr).s6_addr16[6]), \ + ntohs((addr).s6_addr16[7]) + +/* Route the packet according to the routing keys specified in + * route_info. Keys are : + * - ifindex : + * 0 if no oif preferred, + * otherwise set to the index of the desired oif + * - route_info->gw : + * 0 if no gateway specified, + * otherwise set to the next host to which the pkt must be routed + * If success, skb->dev is the output device to which the packet must + * be sent and skb->dst is not NULL + * + * RETURN: 1 if the packet was successfully routed to the + * destination desired + * 0 if the kernel routing table could not route the packet + * according to the keys specified + */ +static int +route6(struct sk_buff *skb, + unsigned int ifindex, + const struct ip6t_route_target_info *route_info) +{ + struct rt6_info *rt = NULL; + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + struct in6_addr *gw = (struct in6_addr*)&route_info->gw; + + DEBUGP("ip6t_ROUTE: called with: "); + DEBUGP("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ipv6h->daddr)); + DEBUGP("GATEWAY=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(*gw)); + DEBUGP("OUT=%s\n", route_info->oif); + + if (ipv6_addr_any(gw)) + rt = rt6_lookup(&ipv6h->daddr, &ipv6h->saddr, ifindex, 1); + else + rt = rt6_lookup(gw, &ipv6h->saddr, ifindex, 1); + + if (!rt) + goto no_route; + + DEBUGP("ip6t_ROUTE: routing gives: "); + DEBUGP("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(rt->rt6i_dst.addr)); + DEBUGP("GATEWAY=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(rt->rt6i_gateway)); + DEBUGP("OUT=%s\n", rt->rt6i_dev->name); + + if (ifindex && rt->rt6i_dev->ifindex!=ifindex) + goto wrong_route; + + if (!rt->rt6i_nexthop) { + DEBUGP("ip6t_ROUTE: discovering neighbour\n"); + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev,
&rt->rt6i_dst.addr); + } + + /* Drop old route. */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + skb->dev = rt->rt6i_dev; + return 1; + + wrong_route: + dst_release(&rt->u.dst); + no_route: + if (!net_ratelimit()) + return 0; + + printk("ip6t_ROUTE: no explicit route found "); + if (ifindex) + printk("via interface %s ", route_info->oif); + if (!ipv6_addr_any(gw)) + printk("via gateway %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", NIP6(*gw)); + printk("\n"); + return 0; +} + + +/* Stolen from ip6_output_finish + * PRE : skb->dev is set to the device we are leaving by + * skb->dst is not NULL + * POST: the packet is sent with the link layer header pushed + * the packet is destroyed + */ +static void ip_direct_send(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct hh_cache *hh = dst->hh; + + if (hh) { + read_lock_bh(&hh->hh_lock); + memcpy(skb->data - 16, hh->hh_data, 16); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); + hh->hh_output(skb); + } else if (dst->neighbour) + dst->neighbour->output(skb); + else { + if (net_ratelimit()) + DEBUGP(KERN_DEBUG "ip6t_ROUTE: no hdr & no neighbour cache!\n"); + kfree_skb(skb); + } +} + + +static unsigned int +route6_oif(const struct ip6t_route_target_info *route_info, + struct sk_buff *skb) +{ + unsigned int ifindex = 0; + struct net_device *dev_out = NULL; + + /* The user set the interface name to use. + * Getting the current interface index. 
+ */ + if ((dev_out = dev_get_by_name(route_info->oif))) { + ifindex = dev_out->ifindex; + } else { + /* Unknown interface name : packet dropped */ + if (net_ratelimit()) + DEBUGP("ip6t_ROUTE: oif interface %s not found\n", route_info->oif); + + if (route_info->flags & IP6T_ROUTE_CONTINUE) + return IP6T_CONTINUE; + else + return NF_DROP; + } + + /* Trying the standard way of routing packets */ + if (route6(skb, ifindex, route_info)) { + dev_put(dev_out); + if (route_info->flags & IP6T_ROUTE_CONTINUE) + return IP6T_CONTINUE; + + ip_direct_send(skb); + return NF_STOLEN; + } else + return NF_DROP; +} + + +static unsigned int +route6_gw(const struct ip6t_route_target_info *route_info, + struct sk_buff *skb) +{ + if (route6(skb, 0, route_info)) { + if (route_info->flags & IP6T_ROUTE_CONTINUE) + return IP6T_CONTINUE; + + ip_direct_send(skb); + return NF_STOLEN; + } else + return NF_DROP; +} + + +static unsigned int +ip6t_route_target(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_route_target_info *route_info = targinfo; + struct sk_buff *skb = *pskb; + struct in6_addr *gw = (struct in6_addr*)&route_info->gw; + unsigned int res; + + if (route_info->flags & IP6T_ROUTE_CONTINUE) + goto do_it; + + /* If we are at PREROUTING or INPUT hook + * the TTL isn't decreased by the IP stack + */ + if (hooknum == NF_IP6_PRE_ROUTING || + hooknum == NF_IP6_LOCAL_IN) { + + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + + if (ipv6h->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = skb->dst->dev; + + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0, skb->dev); + + return NF_DROP; + } + + ipv6h->hop_limit--; + } + + if ((route_info->flags & IP6T_ROUTE_TEE)) { + /* + * Copy the *pskb, and route the copy. Will later return + * IP6T_CONTINUE for the original skb, which should continue + * on its way as if nothing happened. 
The copy should be + * independently delivered to the ROUTE --gw. + */ + skb = skb_copy(*pskb, GFP_ATOMIC); + if (!skb) { + if (net_ratelimit()) + DEBUGP(KERN_DEBUG "ip6t_ROUTE: copy failed!\n"); + return IP6T_CONTINUE; + } + } + +do_it: + if (route_info->oif[0]) { + res = route6_oif(route_info, skb); + } else if (!ipv6_addr_any(gw)) { + res = route6_gw(route_info, skb); + } else { + if (net_ratelimit()) + DEBUGP(KERN_DEBUG "ip6t_ROUTE: no parameter !\n"); + res = IP6T_CONTINUE; + } + + if ((route_info->flags & IP6T_ROUTE_TEE)) + res = IP6T_CONTINUE; + + return res; +} + + +static int +ip6t_route_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (strcmp(tablename, "mangle") != 0) { + printk("ip6t_ROUTE: can only be called from \"mangle\" table.\n"); + return 0; + } + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_route_target_info))) { + printk(KERN_WARNING "ip6t_ROUTE: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ip6t_route_target_info))); + return 0; + } + + return 1; +} + + +static struct ip6t_target ip6t_route_reg = { + .name = "ROUTE", + .target = ip6t_route_target, + .checkentry = ip6t_route_checkentry, + .me = THIS_MODULE +}; + + +static int __init init(void) +{ + printk(KERN_DEBUG "registering ipv6 ROUTE target\n"); + if (ip6t_register_target(&ip6t_route_reg)) + return -EINVAL; + + return 0; +} + + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_route_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/ipv6/netfilter/ip6t_condition.c b/release/src/linux/linux/net/ipv6/netfilter/ip6t_condition.c new file mode 100644 index 00000000..15d805e6 --- /dev/null +++ b/release/src/linux/linux/net/ipv6/netfilter/ip6t_condition.c @@ -0,0 +1,254 @@ +/*-------------------------------------------*\ +| Netfilter Condition Module for IPv6 | +| | +| Description: This
module allows firewall | +| rules to match using condition variables | +| stored in /proc files. | +| | +| Author: Stephane Ouellette 2003-02-10 | +| | +| | +| This software is distributed under the | +| terms of the GNU GPL. | +\*-------------------------------------------*/ + +#include +#include +#include +#include +#include +#include +#include + + +#ifndef CONFIG_PROC_FS +#error "Proc file system support is required for this module" +#endif + + +MODULE_AUTHOR("Stephane Ouellette "); +MODULE_DESCRIPTION("Allows rules to match against condition variables"); +MODULE_LICENSE("GPL"); + + +struct condition_variable { + struct condition_variable *next; + struct proc_dir_entry *status_proc; + atomic_t refcount; + int enabled; /* TRUE == 1, FALSE == 0 */ +}; + + +static rwlock_t list_lock; +static struct condition_variable *head = NULL; +static struct proc_dir_entry *proc_net_condition = NULL; + + +static int +ipt_condition_read_info(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + struct condition_variable *var = + (struct condition_variable *) data; + + if (offset == 0) { + *start = buffer; + buffer[0] = (var->enabled) ? 
'1' : '0'; + buffer[1] = '\n'; + return 2; + } + + *eof = 1; + return 0; +} + + +static int +ipt_condition_write_info(struct file *file, const char *buffer, + unsigned long length, void *data) +{ + struct condition_variable *var = + (struct condition_variable *) data; + + if (length) { + /* Match only on the first character */ + switch (buffer[0]) { + case '0': + var->enabled = 0; + break; + case '1': + var->enabled = 1; + } + } + + return (int) length; +} + + +static int +match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, int offset, + const void *hdr, u_int16_t datalen, int *hotdrop) +{ + const struct condition6_info *info = + (const struct condition6_info *) matchinfo; + struct condition_variable *var; + int condition_status = 0; + + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + condition_status = var->enabled; + break; + } + } + + read_unlock(&list_lock); + + return condition_status ^ info->invert; +} + + + +static int +checkentry(const char *tablename, const struct ip6t_ip6 *ip, + void *matchinfo, unsigned int matchsize, unsigned int hook_mask) +{ + struct condition6_info *info = + (struct condition6_info *) matchinfo; + struct condition_variable *var, *newvar; + + if (matchsize != IP6T_ALIGN(sizeof(struct condition6_info))) + return 0; + + /* The first step is to check if the condition variable already exists. 
*/ + /* Here, a read lock is sufficient because we won't change the list */ + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + atomic_inc(&var->refcount); + read_unlock(&list_lock); + return 1; + } + } + + read_unlock(&list_lock); + + /* At this point, we need to allocate a new condition variable */ + newvar = kmalloc(sizeof(struct condition_variable), GFP_KERNEL); + + if (!newvar) + return 0; /* checkentry signals failure with 0; a non-zero errno would read as success */ + + /* Create the condition variable's proc file entry */ + newvar->status_proc = create_proc_entry(info->name, 0644, proc_net_condition); + + if (!newvar->status_proc) { + /* + * There are two possibilities: + * 1- Another condition variable with the same name has been created, which is valid. + * 2- There was a memory allocation error. + */ + kfree(newvar); + read_lock(&list_lock); + + for (var = head; var; var = var->next) { + if (strcmp(info->name, var->status_proc->name) == 0) { + atomic_inc(&var->refcount); + read_unlock(&list_lock); + return 1; + } + } + + read_unlock(&list_lock); + return 0; /* no existing variable and no proc entry: reject the rule */ + } + + atomic_set(&newvar->refcount, 1); + newvar->enabled = 0; + newvar->status_proc->owner = THIS_MODULE; + newvar->status_proc->data = newvar; + wmb(); + newvar->status_proc->read_proc = ipt_condition_read_info; + newvar->status_proc->write_proc = ipt_condition_write_info; + + write_lock(&list_lock); + + newvar->next = head; + head = newvar; + + write_unlock(&list_lock); + + return 1; +} + + +static void +destroy(void *matchinfo, unsigned int matchsize) +{ + struct condition6_info *info = + (struct condition6_info *) matchinfo; + struct condition_variable *var, *prev = NULL; + + if (matchsize != IP6T_ALIGN(sizeof(struct condition6_info))) + return; + + write_lock(&list_lock); + + for (var = head; var && strcmp(info->name, var->status_proc->name); + prev = var, var = var->next); + + if (var && atomic_dec_and_test(&var->refcount)) { + if (prev) + prev->next = var->next; + else + head =
var->next; + + write_unlock(&list_lock); + remove_proc_entry(var->status_proc->name, proc_net_condition); + kfree(var); + } else + write_unlock(&list_lock); +} + + +static struct ip6t_match condition_match = { + .name = "condition", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + + +static int __init +init(void) +{ + int errorcode; + + rwlock_init(&list_lock); + proc_net_condition = proc_mkdir("ip6t_condition", proc_net); + + if (proc_net_condition) { + errorcode = ip6t_register_match(&condition_match); /* ip6t_match belongs to ip6_tables, not ip_tables */ + + if (errorcode) + remove_proc_entry("ip6t_condition", proc_net); + } else + errorcode = -EACCES; + + return errorcode; +} + + +static void __exit +fini(void) +{ + ip6t_unregister_match(&condition_match); /* must mirror the registration done in init() */ + remove_proc_entry("ip6t_condition", proc_net); +} + +module_init(init); +module_exit(fini); diff --git a/release/src/linux/linux/net/sched/Config.in b/release/src/linux/linux/net/sched/Config.in index 8e203456..468fdf2a 100644 --- a/release/src/linux/linux/net/sched/Config.in +++ b/release/src/linux/linux/net/sched/Config.in @@ -5,13 +5,14 @@ tristate ' CBQ packet scheduler' CONFIG_NET_SCH_CBQ tristate ' HTB packet scheduler' CONFIG_NET_SCH_HTB tristate ' CSZ packet scheduler' CONFIG_NET_SCH_CSZ #tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ -#tristate ' H-FSC packet scheduler' CONFIG_NET_SCH_HFCS +tristate ' H-FSC packet scheduler' CONFIG_NET_SCH_HFSC if [ "$CONFIG_ATM" = "y" ]; then bool ' ATM pseudo-scheduler' CONFIG_NET_SCH_ATM fi tristate ' The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO tristate ' RED queue' CONFIG_NET_SCH_RED tristate ' SFQ queue' CONFIG_NET_SCH_SFQ +tristate ' ESFQ queue' CONFIG_NET_SCH_ESFQ tristate ' TEQL queue' CONFIG_NET_SCH_TEQL tristate ' TBF queue' CONFIG_NET_SCH_TBF tristate ' GRED queue' CONFIG_NET_SCH_GRED diff --git a/release/src/linux/linux/net/sched/Makefile b/release/src/linux/linux/net/sched/Makefile index e48e5c3e..49cf71e7 100644 ---
a/release/src/linux/linux/net/sched/Makefile +++ b/release/src/linux/linux/net/sched/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o +obj-$(CONFIG_NET_SCH_ESFQ) += sch_esfq.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o diff --git a/release/src/linux/linux/net/sched/sch_api.c b/release/src/linux/linux/net/sched/sch_api.c index a5d8945e..ae384433 100644 --- a/release/src/linux/linux/net/sched/sch_api.c +++ b/release/src/linux/linux/net/sched/sch_api.c @@ -1232,6 +1232,9 @@ int __init pktsched_init(void) #ifdef CONFIG_NET_SCH_SFQ INIT_QDISC(sfq); #endif +#ifdef CONFIG_NET_SCH_ESFQ + INIT_QDISC(esfq); +#endif #ifdef CONFIG_NET_SCH_TBF INIT_QDISC(tbf); #endif diff --git a/release/src/linux/linux/net/sched/sch_esfq.c b/release/src/linux/linux/net/sched/sch_esfq.c new file mode 100644 index 00000000..26640f18 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_esfq.c @@ -0,0 +1,652 @@ +/* + * net/sched/sch_esfq.c Extended Stochastic Fairness Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: Alexander Atanasov, + * Added dynamic depth,limit,divisor,hash_kind options. + * Added dst and src hashes. + * + * Alexander Clouter, + * Ported ESFQ to Linux 2.6. + * + * Corey Hickey, + * Maintenance of the Linux 2.6 port. + * Added fwmark hash (thanks to Robert Kurjata) + * Added direct hashing for src, dst, and fwmark. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Stochastic Fairness Queuing algorithm. + For more comments look at sch_sfq.c. + The difference is that you can change limit, depth, + hash table size and choose 7 hash types. + + classic: same as in sch_sfq.c + dst: destination IP address + src: source IP address + fwmark: netfilter mark value + dst_direct: + src_direct: + fwmark_direct: direct hashing of the above sources + + TODO: + make sfq_change work. +*/ + +#ifndef IPPROTO_SCTP +#define IPPROTO_SCTP 132 +#endif +#ifndef IPPROTO_DCCP +#define IPPROTO_DCCP 33 +#endif + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned int esfq_index; + +struct esfq_head +{ + esfq_index next; + esfq_index prev; +}; + +struct esfq_sched_data +{ +/* Parameters */ + int perturb_period; + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + int limit; + unsigned depth; + unsigned hash_divisor; + unsigned hash_kind; +/* Variables */ + struct timer_list perturb_timer; + int perturbation; + esfq_index tail; /* Index of current slot in round */ + esfq_index max_depth; /* Maximal depth */ + + esfq_index *ht; /* Hash table */ + esfq_index *next; /* Active slots link */ + short *allot; /* Current allotment per slot */ + unsigned short *hash; /* Hash value indexed by slots */ + struct sk_buff_head *qs; /* Slot queue */ + struct esfq_head *dep; /* Linked list of slots, indexed by depth */ + unsigned dyn_min; /* For dynamic divisor adjustment; minimum value seen */ + unsigned dyn_max; /* maximum value seen */ + unsigned dyn_range; /* saved range */ +}; + +static __inline__ unsigned esfq_hash_u32(struct esfq_sched_data *q,u32 h) +{ + int pert = q->perturbation; + + if (pert) + h = (h<<pert) ^ (h>>(0x1F - pert)); /* shift-mix the key with the current perturbation */ + + h = ntohl(h) *
2654435761UL; + return h & (q->hash_divisor-1); +} + +/* Hash input values directly into the "nearest" slot, taking into account the + * range of input values seen. This is most useful when the hash table is at + * least as large as the range of possible values. */ +static __inline__ unsigned esfq_hash_direct(struct esfq_sched_data *q, u32 h) +{ + /* adjust minimum and maximum */ + if (h < q->dyn_min || h > q->dyn_max) { + q->dyn_min = h < q->dyn_min ? h : q->dyn_min; + q->dyn_max = h > q->dyn_max ? h : q->dyn_max; + + /* find new range */ + if ((q->dyn_range = q->dyn_max - q->dyn_min) >= q->hash_divisor) + printk(KERN_WARNING "ESFQ: (direct hash) Input range %u is larger than hash " + "table. See ESFQ README for details.\n", q->dyn_range); + } + + /* hash input values into slot numbers */ + if (q->dyn_min == q->dyn_max) + return 0; /* only one value seen; avoid division by 0 */ + else + return (h - q->dyn_min) * (q->hash_divisor - 1) / q->dyn_range; +} + +static __inline__ unsigned esfq_fold_hash_classic(struct esfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? 
*/ + h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); /* fold the perturbed second key into h */ + h ^= h>>10; + return h & (q->hash_divisor-1); +} + +static unsigned esfq_hash(struct esfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + u32 hs; + u32 nfm; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + hs = iph->saddr; + nfm = skb->nfmark; + h2 = hs^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_SCTP || + iph->protocol == IPPROTO_DCCP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + hs = iph->saddr.s6_addr32[3]; + nfm = skb->nfmark; + h2 = hs^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_SCTP || + iph->nexthdr == IPPROTO_DCCP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst; + hs = (u32)(unsigned long)skb->sk; + nfm = skb->nfmark; + h2 = hs^skb->protocol; + } + switch(q->hash_kind) + { + case TCA_SFQ_HASH_CLASSIC: + return esfq_fold_hash_classic(q, h, h2); + case TCA_SFQ_HASH_DST: + return esfq_hash_u32(q,h); + case TCA_SFQ_HASH_DSTDIR: + return esfq_hash_direct(q, ntohl(h)); + case TCA_SFQ_HASH_SRC: + return esfq_hash_u32(q,hs); + case TCA_SFQ_HASH_SRCDIR: + return esfq_hash_direct(q, ntohl(hs)); +#ifdef CONFIG_NETFILTER + case TCA_SFQ_HASH_FWMARK: + return esfq_hash_u32(q,nfm); + case TCA_SFQ_HASH_FWMARKDIR: + return esfq_hash_direct(q,nfm); +#endif + default: + if (net_ratelimit()) + printk(KERN_WARNING "ESFQ: Unknown hash method.
Falling back to classic.\n"); + } + return esfq_fold_hash_classic(q, h, h2); +} + +static inline void esfq_link(struct esfq_sched_data *q, esfq_index x) +{ + esfq_index p, n; + int d = q->qs[x].qlen + q->depth; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +static inline void esfq_dec(struct esfq_sched_data *q, esfq_index x) +{ + esfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + esfq_link(q, x); +} + +static inline void esfq_inc(struct esfq_sched_data *q, esfq_index x) +{ + esfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + esfq_link(q, x); +} + +static int esfq_drop(struct Qdisc *sch) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + esfq_index d = q->max_depth; + struct sk_buff *skb; + unsigned int len; + + /* Queue is full! Find the longest slot and + drop a packet from it */ + + if (d > 1) { + esfq_index x = q->dep[d+q->depth].next; + skb = q->qs[x].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb); + esfq_dec(q, x); + sch->q.qlen--; + sch->stats.drops++; + return len; + } + + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + esfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = q->depth; + sch->stats.drops++; + return len; + } + + return 0; +} + +static int +esfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + unsigned hash = esfq_hash(q, skb); + unsigned depth = q->depth; + esfq_index x; + + x = q->ht[hash]; + if (x == depth) { + q->ht[hash] = x = q->dep[depth].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + esfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == depth) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit-1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + + esfq_drop(sch); + return NET_XMIT_CN; +} + +static int +esfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + unsigned hash = esfq_hash(q, skb); + unsigned depth = q->depth; + esfq_index x; + + x = q->ht[hash]; + if (x == depth) { + q->ht[hash] = x = q->dep[depth].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + esfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == depth) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit - 1) { +// sch->stats.requeues++; + return 0; + } + + sch->stats.drops++; + esfq_drop(sch); + return NET_XMIT_CN; +} + + + + +static struct sk_buff * +esfq_dequeue(struct Qdisc* sch) +{ + struct esfq_sched_data *q = (struct 
esfq_sched_data *)sch->data; + struct sk_buff *skb; + unsigned depth = q->depth; + esfq_index a, old_a; + + /* No active slots */ + if (q->tail == depth) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + esfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? */ + if (q->qs[a].qlen == 0) { + q->ht[q->hash[a]] = depth; + a = q->next[a]; + if (a == old_a) { + q->tail = depth; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + + return skb; +} + +static void +esfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = esfq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void esfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + + q->perturbation = net_random()&0x1F; + + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +/* +static int esfq_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + struct tc_esfq_qopt *ctl = RTA_DATA(opt); + int old_perturb = q->perturb_period; + + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + sch_tree_lock(sch); + q->quantum = ctl->quantum ? : psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; +// q->hash_divisor = ctl->divisor; +// q->tail = q->limit = q->depth = ctl->flows; + + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, q->depth); + + if (ctl->hash_kind) { + q->hash_kind = ctl->hash_kind; + if (q->hash_kind != TCA_SFQ_HASH_CLASSIC) + q->perturb_period = 0; + } + + // is sch_tree_lock enough to do this ? 
+ while (sch->q.qlen >= q->limit-1) + esfq_drop(sch); + + if (old_perturb) + del_timer(&q->perturb_timer); + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } else { + q->perturbation = 0; + } + sch_tree_unlock(sch); + return 0; +} +*/ + +static int esfq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + struct tc_esfq_qopt *ctl; + esfq_index p = ~0UL/2; + int i; + + if (opt && opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + init_timer(&q->perturb_timer); + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = esfq_perturbation; + q->perturbation = 0; + q->hash_kind = TCA_SFQ_HASH_CLASSIC; + q->max_depth = 0; + q->dyn_min = ~0U; /* maximum value for this type */ + q->dyn_max = 0; /* dyn_min/dyn_max will be set properly upon first packet */ + if (opt == NULL) { + q->quantum = psched_mtu(sch->dev); + q->perturb_period = 0; + q->hash_divisor = 1024; + q->tail = q->limit = q->depth = 128; + + } else { + ctl = RTA_DATA(opt); + q->quantum = ctl->quantum ? : psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + q->hash_divisor = ctl->divisor ? : 1024; + q->tail = q->limit = q->depth = ctl->flows ? 
: 128; + + if ( q->depth > p - 1 ) + return -EINVAL; + + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, q->depth); + + if (ctl->hash_kind) { + q->hash_kind = ctl->hash_kind; + } + + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + } + + q->ht = kmalloc(q->hash_divisor*sizeof(esfq_index), GFP_KERNEL); + if (!q->ht) + goto err_case; + + q->dep = kmalloc((1+q->depth*2)*sizeof(struct esfq_head), GFP_KERNEL); + if (!q->dep) + goto err_case; + q->next = kmalloc(q->depth*sizeof(esfq_index), GFP_KERNEL); + if (!q->next) + goto err_case; + + q->allot = kmalloc(q->depth*sizeof(short), GFP_KERNEL); + if (!q->allot) + goto err_case; + q->hash = kmalloc(q->depth*sizeof(unsigned short), GFP_KERNEL); + if (!q->hash) + goto err_case; + q->qs = kmalloc(q->depth*sizeof(struct sk_buff_head), GFP_KERNEL); + if (!q->qs) + goto err_case; + + for (i=0; i< q->hash_divisor; i++) + q->ht[i] = q->depth; + for (i=0; i<q->depth; i++) { /* each depth list head starts out pointing at itself */ + skb_queue_head_init(&q->qs[i]); + q->dep[i+q->depth].next = i+q->depth; + q->dep[i+q->depth].prev = i+q->depth; + } + + for (i=0; i<q->depth; i++) /* link every (still empty) slot into its depth list */ + esfq_link(q, i); + return 0; +err_case: + del_timer(&q->perturb_timer); + if (q->ht) + kfree(q->ht); + if (q->dep) + kfree(q->dep); + if (q->next) + kfree(q->next); + if (q->allot) + kfree(q->allot); + if (q->hash) + kfree(q->hash); + if (q->qs) + kfree(q->qs); + return -ENOBUFS; +} + +static void esfq_destroy(struct Qdisc *sch) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + del_timer(&q->perturb_timer); + if(q->ht) + kfree(q->ht); + if(q->dep) + kfree(q->dep); + if(q->next) + kfree(q->next); + if(q->allot) + kfree(q->allot); + if(q->hash) + kfree(q->hash); + if(q->qs) + kfree(q->qs); +} + +static int esfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct esfq_sched_data *q = (struct esfq_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_esfq_qopt opt; + + opt.quantum = q->quantum; +
opt.perturb_period = q->perturb_period/HZ; + + opt.limit = q->limit; + opt.divisor = q->hash_divisor; + opt.flows = q->depth; + opt.hash_kind = q->hash_kind; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_ops esfq_qdisc_ops = +{ + .next = NULL, + .cl_ops = NULL, + .id = "esfq", + .priv_size = sizeof(struct esfq_sched_data), + .enqueue = esfq_enqueue, + .dequeue = esfq_dequeue, + .requeue = esfq_requeue, + .drop = esfq_drop, + .init = esfq_init, + .reset = esfq_reset, + .destroy = esfq_destroy, + .change = NULL, /* esfq_change - needs more work */ + .dump = esfq_dump, +// .owner = THIS_MODULE, +}; + +static int __init esfq_module_init(void) +{ + return register_qdisc(&esfq_qdisc_ops); +} +static void __exit esfq_module_exit(void) +{ + unregister_qdisc(&esfq_qdisc_ops); +} +module_init(esfq_module_init) +module_exit(esfq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_fifo.c b/release/src/linux/linux/net/sched/sch_fifo.c index d8ce46f2..3a7741e9 100644 --- a/release/src/linux/linux/net/sched/sch_fifo.c +++ b/release/src/linux/linux/net/sched/sch_fifo.c @@ -46,7 +46,7 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; - if (sch->stats.backlog <= q->limit) { + if (sch->stats.backlog + skb->len <= q->limit) { __skb_queue_tail(&sch->q, skb); sch->stats.backlog += skb->len; sch->stats.bytes += skb->len; @@ -87,9 +87,10 @@ fifo_drop(struct Qdisc* sch) skb = __skb_dequeue_tail(&sch->q); if (skb) { - sch->stats.backlog -= skb->len; + int len = skb->len; + sch->stats.backlog -= len; kfree_skb(skb); - return 1; + return len; } return 0; } @@ -106,7 +107,7 @@ pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) { struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; - if (sch->q.qlen <= q->limit) { + if (sch->q.qlen < q->limit) { 
__skb_queue_tail(&sch->q, skb); sch->stats.bytes += skb->len; sch->stats.packets++; @@ -139,10 +140,12 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt) struct fifo_sched_data *q = (void*)sch->data; if (opt == NULL) { + unsigned int limit = sch->dev->tx_queue_len ? : 1; + if (sch->ops == &bfifo_qdisc_ops) - q->limit = sch->dev->tx_queue_len*sch->dev->mtu; + q->limit = limit*sch->dev->mtu; else - q->limit = sch->dev->tx_queue_len; + q->limit = limit; } else { struct tc_fifo_qopt *ctl = RTA_DATA(opt); if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) diff --git a/release/src/linux/linux/net/sched/sch_generic.c b/release/src/linux/linux/net/sched/sch_generic.c index 7b0d49e7..ca30d124 100644 --- a/release/src/linux/linux/net/sched/sch_generic.c +++ b/release/src/linux/linux/net/sched/sch_generic.c @@ -29,6 +29,9 @@ #include #include #include +#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) +#include +#endif #include #include @@ -79,6 +82,10 @@ int qdisc_restart(struct net_device *dev) struct Qdisc *q = dev->qdisc; struct sk_buff *skb; + /* BRCM: bail out if queue is null */ + if (!q) + return 0; + /* Dequeue packet */ if ((skb = q->dequeue(q)) != NULL) { if (spin_trylock(&dev->xmit_lock)) { @@ -89,7 +96,11 @@ int qdisc_restart(struct net_device *dev) spin_unlock(&dev->queue_lock); if (!netif_queue_stopped(dev)) { - if (netdev_nit) + if (netdev_nit +#if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE) + && !(skb->imq_flags & IMQ_F_ENQUEUE) +#endif + ) dev_queue_xmit_nit(skb, dev); if (dev->hard_start_xmit(skb, dev) == 0) { diff --git a/release/src/linux/linux/net/sched/sch_hfsc.c b/release/src/linux/linux/net/sched/sch_hfsc.c new file mode 100644 index 00000000..0b6e6d38 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_hfsc.c @@ -0,0 +1,1817 @@ +/* + * Copyright (c) 2003 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the 
Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * 2003-10-17 - Ported from altq + */ +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. 
+ * when a class has an upperlimit, the fit-time is computed from the
+ * upperlimit service curve. the link-sharing scheduler does not schedule
+ * a class whose fit-time exceeds the current time.
+ */
+
+/*
+ * NOTE(review): none of the include directives below carries a header
+ * name in this copy of the patch; the names look like they were stripped
+ * and have to be restored from the original file before this can build.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* set to non-zero to turn ASSERT() (defined below) into a printk check */
+#define HFSC_DEBUG 0
+
+/*
+ * kernel internal service curve representation:
+ *   coordinates are given by 64 bit unsigned integers.
+ *   x-axis: unit is clock count.
+ *   y-axis: unit is byte.
+ *
+ * The service curve parameters are converted to the internal
+ * representation. The slope values are scaled to avoid overflow.
+ * The inverse slope values as well as the y-projection of the 1st
+ * segment are kept in order to avoid 64-bit divide operations
+ * that are expensive on 32-bit architectures.
+ */
+
+/* two-segment service curve in the scaled internal representation */
+struct internal_sc
+{
+	u64	sm1;	/* scaled slope of the 1st segment */
+	u64	ism1;	/* scaled inverse-slope of the 1st segment */
+	u64	dx;	/* the x-projection of the 1st segment */
+	u64	dy;	/* the y-projection of the 1st segment */
+	u64	sm2;	/* scaled slope of the 2nd segment */
+	u64	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/*
+ * runtime service curve: the same six curve parameters as internal_sc
+ * plus the point (x, y) the curve currently starts from
+ */
+struct runtime_sc
+{
+	u64	x;	/* current starting position on x-axis */
+	u64	y;	/* current starting position on y-axis */
+	u64	sm1;	/* scaled slope of the 1st segment */
+	u64	ism1;	/* scaled inverse-slope of the 1st segment */
+	u64	dx;	/* the x-projection of the 1st segment */
+	u64	dy;	/* the y-projection of the 1st segment */
+	u64	sm2;	/* scaled slope of the 2nd segment */
+	u64	ism2;	/* scaled inverse-slope of the 2nd segment */
+};
+
+/* bits stored in hfsc_class.cl_flags: which service curves are valid */
+enum hfsc_class_flags
+{
+	HFSC_RSC = 0x1,		/* cl_rsc (real-time curve) is set */
+	HFSC_FSC = 0x2,		/* cl_fsc (fair / link-sharing curve) is set */
+	HFSC_USC = 0x4		/* cl_usc (upperlimit curve) is set */
+};
+
+/* per-class scheduler state */
+struct hfsc_class
+{
+	u32		classid;	/* class id */
+	unsigned int	refcnt;		/* usage count */
+
+	struct tc_stats	stats;		/* generic statistics */
+	unsigned int	level;		/* class level in hierarchy */
+	struct tcf_proto *filter_list;	/* filter list */
+	unsigned int	filter_cnt;	/* filter count */
+
+	struct hfsc_sched *sched;	/* scheduler data */
+	struct hfsc_class *cl_parent;	/* parent class */
+	struct list_head siblings;	/* sibling classes */
+	struct list_head children;	/* child classes */
+	struct Qdisc	*qdisc;		/* leaf qdisc */
+
+	rb_node_t	el_node;	/* qdisc's eligible tree member */
+	rb_root_t	vt_tree;	/* active children sorted by cl_vt */
+	rb_node_t	vt_node;	/* parent's vt_tree member */
+	rb_root_t	cf_tree;	/* active children sorted by cl_f */
+	rb_node_t	cf_node;	/* parent's cf_tree member */
+	struct list_head hlist;		/* hash list member */
+	struct list_head dlist;		/* drop list member */
+
+	u64	cl_total;		/* total work in bytes */
+	u64	cl_cumul;		/* cumulative work in bytes done by
+					   real-time criteria */
+
+	u64	cl_d;			/* deadline */
+	u64	cl_e;			/* eligible time */
+	u64	cl_vt;			/* virtual time */
+	u64	cl_f;			/* time when this class will fit for
+					   link-sharing, max(myf, cfmin) */
+	u64	cl_myf;			/* my fit-time (calculated from this
+					   class's own upperlimit curve) */
+	u64	cl_myfadj;		/* my fit-time adjustment (to cancel
+					   history dependence) */
+	u64	cl_cfmin;		/* earliest children's fit-time (used
+					   with cl_myf to obtain cl_f) */
+	u64	cl_cvtmin;		/* minimal virtual time among the
+					   children fit for link-sharing
+					   (monotonic within a period) */
+	u64	cl_vtadj;		/* intra-period cumulative vt
+					   adjustment */
+	u64	cl_vtoff;		/* inter-period cumulative vt offset */
+	u64	cl_cvtmax;		/* max child's vt in the last period */
+	u64	cl_cvtoff;		/* cumulative cvtmax of all periods */
+	u64	cl_pcvtoff;		/* parent's cvtoff at initialization
+					   time */
+
+	struct internal_sc cl_rsc;	/* internal real-time service curve */
+	struct internal_sc cl_fsc;	/* internal fair service curve */
+	struct internal_sc cl_usc;	/* internal upperlimit service curve */
+	struct runtime_sc cl_deadline;	/* deadline curve */
+	struct runtime_sc cl_eligible;	/* eligible curve */
+	struct runtime_sc cl_virtual;	/* virtual curve */
+	struct runtime_sc cl_ulimit;	/* upperlimit curve */
+
+	unsigned long	cl_flags;	/* which curves are valid */
+	unsigned long	cl_vtperiod;	/* vt period sequence number */
+	unsigned long	cl_parentperiod;/* parent's vt period sequence number */
+	unsigned long	cl_nactive;	/* number of active children */
+};
+
+/* number of buckets in hfsc_sched.clhash; hfsc_hash() masks with HSIZE-1 */
+#define HFSC_HSIZE 16
+
+/* per-qdisc scheduler state */
+struct hfsc_sched
+{
+	u16	defcls;				/* default class id */
+	struct hfsc_class root;			/* root class */
+	struct list_head clhash[HFSC_HSIZE];	/* class hash */
+	rb_root_t eligible;			/* eligible tree */
+	struct list_head droplist;		/* active leaf class list (for
+						   dropping) */
+	struct sk_buff_head requeue;		/* requeued packet */
+	struct timer_list wd_timer;		/* watchdog timer */
+};
+
+/*
+ * macros
+ */
+/*
+ * with the gettimeofday clock source, replace PSCHED_GET_TIME so that
+ * the stamp is a 64-bit count of microseconds (tv_sec * 10^6 + tv_usec).
+ */
+#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
+#include
+#undef PSCHED_GET_TIME
+#define PSCHED_GET_TIME(stamp) \
+do { \
+	struct timeval tv; \
+	do_gettimeofday(&tv); \
+	(stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \
+} while (0)
+#endif
+
+/*
+ * ASSERT() only logs a failed condition through printk; it does not stop
+ * execution. It expands to nothing unless HFSC_DEBUG is non-zero.
+ */
+#if HFSC_DEBUG
+#define ASSERT(cond) \
+do { \
+	if (unlikely(!(cond))) \
+		printk("assertion %s failed at %s:%i (%s)\n", \
+		       #cond, __FILE__, __LINE__, __FUNCTION__); \
+} while (0)
+#else
+#define ASSERT(cond)
+#endif /* HFSC_DEBUG */
+
+#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */
+
+
+/*
+ * eligible tree holds backlogged classes being sorted by their eligible times.
+ * there is one eligible tree per hfsc instance.
+ */
+
+/*
+ * link cl into its scheduler's eligible tree, ordered by cl_e.
+ * a class whose cl_e equals that of an existing node descends to the
+ * right, so it comes after that node in tree order.
+ */
+static void
+eltree_insert(struct hfsc_class *cl)
+{
+	rb_node_t **p = &cl->sched->eligible.rb_node;
+	rb_node_t *parent = NULL;
+	struct hfsc_class *cl1;
+
+	/* walk down to the empty slot where the new node belongs */
+	while (*p != NULL) {
+		parent = *p;
+		cl1 = rb_entry(parent, struct hfsc_class, el_node);
+		if (cl->cl_e >= cl1->cl_e)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&cl->el_node, parent, p);
+	rb_insert_color(&cl->el_node, &cl->sched->eligible);
+}
+
+/* unlink cl from its scheduler's eligible tree */
+static inline void
+eltree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->el_node, &cl->sched->eligible);
+}
+
+/* reposition cl in the eligible tree (remove, then insert again) */
+static inline void
+eltree_update(struct hfsc_class *cl)
+{
+	eltree_remove(cl);
+	eltree_insert(cl);
+}
+
+/*
+ * find the class with the minimum deadline among the eligible classes,
+ * i.e. among those with cl_e <= cur_time. the tree is walked in order and
+ * the scan stops at the first class whose cl_e is later than cur_time.
+ * returns NULL when no class is eligible.
+ */
+static inline struct hfsc_class *
+eltree_get_mindl(struct hfsc_sched *q, u64 cur_time)
+{
+	struct hfsc_class *p, *cl = NULL;
+	rb_node_t *n;
+
+	for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) {
+		p = rb_entry(n, struct hfsc_class, el_node);
+		if (p->cl_e > cur_time)
+			break;
+		if (cl == NULL || p->cl_d < cl->cl_d)
+			cl = p;
+	}
+	return cl;
+}
+
+/*
+ * find the class with minimum eligible time among the eligible classes
+ * (the first node of the eligible tree); NULL when the tree is empty
+ */
+static inline struct hfsc_class *
+eltree_get_minel(struct hfsc_sched *q)
+{
+	rb_node_t *n;
+
+	n = rb_first(&q->eligible);
+	if (n == NULL)
+		return NULL;
+	return rb_entry(n, struct hfsc_class, el_node);
+}
+
+/*
+ * vttree holds holds backlogged child classes being sorted by their virtual
+ * time. each intermediate class has one vttree.
+ */
+/*
+ * link cl into its parent's vt_tree, ordered by cl_vt; on equal cl_vt
+ * the new node descends to the right. cl->cl_parent is dereferenced
+ * without a check.
+ */
+static void
+vttree_insert(struct hfsc_class *cl)
+{
+	rb_node_t **p = &cl->cl_parent->vt_tree.rb_node;
+	rb_node_t *parent = NULL;
+	struct hfsc_class *cl1;
+
+	while (*p != NULL) {
+		parent = *p;
+		cl1 = rb_entry(parent, struct hfsc_class, vt_node);
+		if (cl->cl_vt >= cl1->cl_vt)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&cl->vt_node, parent, p);
+	rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+/* unlink cl from its parent's vt_tree */
+static inline void
+vttree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree);
+}
+
+/* reposition cl in its parent's vt_tree (remove, then insert again) */
+static inline void
+vttree_update(struct hfsc_class *cl)
+{
+	vttree_remove(cl);
+	vttree_insert(cl);
+}
+
+/*
+ * return the first child in cl's vt_tree, in tree order, whose fit-time
+ * cl_f is not later than cur_time; NULL if there is none
+ */
+static inline struct hfsc_class *
+vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
+{
+	struct hfsc_class *p;
+	rb_node_t *n;
+
+	for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) {
+		p = rb_entry(n, struct hfsc_class, vt_node);
+		if (p->cl_f <= cur_time)
+			return p;
+	}
+	return NULL;
+}
+
+/*
+ * get the leaf class with the minimum vt in the hierarchy.
+ * descends from cl by taking vttree_firstfit() at each level until a
+ * class with level 0 is reached; returns NULL if any level has no
+ * fitting child.
+ */
+static struct hfsc_class *
+vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
+{
+	/* if root-class's cfmin is bigger than cur_time nothing to do */
+	if (cl->cl_cfmin > cur_time)
+		return NULL;
+
+	while (cl->level > 0) {
+		cl = vttree_firstfit(cl, cur_time);
+		if (cl == NULL)
+			return NULL;
+		/*
+		 * update parent's cl_cvtmin: it is only ever raised here,
+		 * to the vt of the child that was just selected.
+		 */
+		if (cl->cl_parent->cl_cvtmin < cl->cl_vt)
+			cl->cl_parent->cl_cvtmin = cl->cl_vt;
+	}
+	return cl;
+}
+
+/*
+ * link cl into its parent's cf_tree, ordered by fit-time cl_f; on equal
+ * cl_f the new node descends to the right
+ */
+static void
+cftree_insert(struct hfsc_class *cl)
+{
+	rb_node_t **p = &cl->cl_parent->cf_tree.rb_node;
+	rb_node_t *parent = NULL;
+	struct hfsc_class *cl1;
+
+	while (*p != NULL) {
+		parent = *p;
+		cl1 = rb_entry(parent, struct hfsc_class, cf_node);
+		if (cl->cl_f >= cl1->cl_f)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&cl->cf_node, parent, p);
+	rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+/* unlink cl from its parent's cf_tree */
+static inline void
+cftree_remove(struct hfsc_class *cl)
+{
+	rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree);
+}
+
+/* reposition cl in its parent's cf_tree (remove, then insert again) */
+static inline void
+cftree_update(struct hfsc_class *cl)
+{
+	cftree_remove(cl);
+	cftree_insert(cl);
+}
+
+/*
+ * service curve support functions
+ *
+ * external service curve parameters
+ * m: bps
+ * d: us
+ * internal service curve parameters
+ * sm: (bytes/psched_us) << SM_SHIFT
+ * ism: (psched_us/byte) << ISM_SHIFT
+ * dx: psched_us
+ *
+ * Time source resolution
+ * PSCHED_JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us.
+ * PSCHED_CPU: resolution is between 0.5us and 1us.
+ * PSCHED_GETTIMEOFDAY: resolution is exactly 1us.
+ *
+ * sm and ism are scaled in order to keep effective digits.
+ * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
+ * digits in decimal using the following table.
+ *
+ * Note: We can afford the additional accuracy (altq hfsc keeps at most
+ * 3 effective digits) thanks to the fact that linux clock is bounded
+ * much more tightly.
+ *
+ * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps
+ * ------------+-------------------------------------------------------
+ * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3
+ * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3
+ * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3
+ *
+ * 0.5us/byte 160 16 1.6 0.16 0.016
+ * us/byte 80 8 0.8 0.08 0.008
+ * 1.27us/byte 63 6.3 0.63 0.063 0.0063
+ */
+#define SM_SHIFT 20
+#define ISM_SHIFT 18
+
+/* masks selecting the low SM_SHIFT / ISM_SHIFT bits of a value */
+#define SM_MASK ((1ULL << SM_SHIFT) - 1)
+#define ISM_MASK ((1ULL << ISM_SHIFT) - 1)
+
+/* map an x offset to a y offset along a segment with scaled slope sm */
+static inline u64
+seg_x2y(u64 x, u64 sm)
+{
+	u64 y;
+
+	/*
+	 * compute
+	 *	y = x * sm >> SM_SHIFT
+	 * but divide it for the upper and lower bits to avoid overflow
+	 */
+	y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
+	return y;
+}
+
+/*
+ * map a y offset to an x offset along a segment with scaled inverse
+ * slope ism. y == 0 always yields 0; otherwise an ism of HT_INFINITY
+ * (which m2ism() produces for a zero rate) yields HT_INFINITY.
+ */
+static inline u64
+seg_y2x(u64 y, u64 ism)
+{
+	u64 x;
+
+	if (y == 0)
+		x = 0;
+	else if (ism == HT_INFINITY)
+		x = HT_INFINITY;
+	else {
+		/* same upper/lower split as in seg_x2y() */
+		x = (y >> ISM_SHIFT) * ism
+		    + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
+	}
+	return x;
+}
+
+/* Convert m (bps) into sm (bytes/psched us); the division rounds up */
+static u64
+m2sm(u32 m)
+{
+	u64 sm;
+
+	sm = ((u64)m << SM_SHIFT);
+	sm += PSCHED_JIFFIE2US(HZ) - 1;
+	do_div(sm, PSCHED_JIFFIE2US(HZ));
+	return sm;
+}
+
+/*
+ * convert m (bps) into ism (psched us/byte); the division rounds up.
+ * m == 0 maps to HT_INFINITY instead of dividing by zero.
+ */
+static u64
+m2ism(u32 m)
+{
+	u64 ism;
+
+	if (m == 0)
+		ism = HT_INFINITY;
+	else {
+		ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT);
+		ism += m - 1;
+		do_div(ism, m);
+	}
+	return ism;
+}
+
+/* convert d (us) into dx (psched us); the division rounds up */
+static u64
+d2dx(u32 d)
+{
+	u64 dx;
+
+	dx = ((u64)d * PSCHED_JIFFIE2US(HZ));
+	dx += 1000000 - 1;
+	do_div(dx, 1000000);
+	return dx;
+}
+
+/* convert sm (bytes/psched us) into m (bps); the result is cut to 32 bits */
+static u32
+sm2m(u64 sm)
+{
+	u64 m;
+
+	m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT;
+	return (u32)m;
+}
+
+/* convert dx (psched us) into d (us); the result is cut to 32 bits */
+static u32
+dx2d(u64 dx)
+{
+	u64 d;
+
+	d = dx * 1000000;
+	do_div(d, PSCHED_JIFFIE2US(HZ));
+	return (u32)d;
+}
+
+/*
+ * fill the internal service curve isc from the external parameters in
+ * sc (m1, d, m2). dy is derived from dx and sm1 after those are set.
+ */
+static void
+sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
+{
+	isc->sm1 = m2sm(sc->m1);
+	isc->ism1 = m2ism(sc->m1);
+	isc->dx = d2dx(sc->d);
+	isc->dy = seg_x2y(isc->dx, isc->sm1);
+	isc->sm2 = m2sm(sc->m2);
+	isc->ism2 = m2ism(sc->m2);
+}
+
+/*
+ * initialize the runtime service curve with the given internal
+ * service curve starting at (x, y).
+ */
+static void
+rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+	rtsc->x = x;
+	rtsc->y = y;
+	rtsc->sm1 = isc->sm1;
+	rtsc->ism1 = isc->ism1;
+	rtsc->dx = isc->dx;
+	rtsc->dy = isc->dy;
+	rtsc->sm2 = isc->sm2;
+	rtsc->ism2 = isc->ism2;
+}
+
+/*
+ * calculate the x-projection of the runtime service curve by the
+ * given y-projection value. a y below the curve's starting y maps to
+ * the starting x.
+ */
+static u64
+rtsc_y2x(struct runtime_sc *rtsc, u64 y)
+{
+	u64 x;
+
+	if (y < rtsc->y)
+		x = rtsc->x;
+	else if (y <= rtsc->y + rtsc->dy) {
+		/* x belongs to the 1st segment */
+		if (rtsc->dy == 0)
+			/* flat 1st segment: jump to its end */
+			x = rtsc->x + rtsc->dx;
+		else
+			x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1);
+	} else {
+		/* x belongs to the 2nd segment */
+		x = rtsc->x + rtsc->dx
+		    + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2);
+	}
+	return x;
+}
+
+/*
+ * calculate the y-projection of the runtime service curve by the
+ * given x-projection value. an x at or before the curve's starting x
+ * maps to the starting y.
+ */
+static u64
+rtsc_x2y(struct runtime_sc *rtsc, u64 x)
+{
+	u64 y;
+
+	if (x <= rtsc->x)
+		y = rtsc->y;
+	else if (x <= rtsc->x + rtsc->dx)
+		/* y belongs to the 1st segment */
+		y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1);
+	else
+		/* y belongs to the 2nd segment */
+		y = rtsc->y + rtsc->dy
+		    + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2);
+	return y;
+}
+
+/*
+ * update the runtime service curve by taking the minimum of the current
+ * runtime service curve and the service curve starting at (x, y).
+ */
+static void
+rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
+{
+	u64 y1, y2, dx, dy;
+	u32 dsm;
+
+	if (isc->sm1 <= isc->sm2) {
+		/* service curve is convex */
+		y1 = rtsc_x2y(rtsc, x);
+		if (y1 < y)
+			/* the current rtsc is smaller */
+			return;
+		/* otherwise only the starting point moves; slopes stay */
+		rtsc->x = x;
+		rtsc->y = y;
+		return;
+	}
+
+	/*
+	 * service curve is concave
+	 * compute the two y values of the current rtsc
+	 *	y1: at x
+	 *	y2: at (x + dx)
+	 */
+	y1 = rtsc_x2y(rtsc, x);
+	if (y1 <= y) {
+		/* rtsc is below isc, no change to rtsc */
+		return;
+	}
+
+	y2 = rtsc_x2y(rtsc, x + isc->dx);
+	if (y2 >= y + isc->dy) {
+		/* rtsc is above isc, replace rtsc by isc */
+		rtsc->x = x;
+		rtsc->y = y;
+		rtsc->dx = isc->dx;
+		rtsc->dy = isc->dy;
+		return;
+	}
+
+	/*
+	 * the two curves intersect
+	 * compute the offsets (dx, dy) using the reverse
+	 * function of seg_x2y()
+	 *	seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
+	 */
+	dx = (y1 - y) << SM_SHIFT;
+	/*
+	 * NOTE(review): sm1 and sm2 are u64 but dsm is u32, so the slope
+	 * difference is truncated if it does not fit in 32 bits -- confirm
+	 * the supported rate range keeps it below that.
+	 */
+	dsm = isc->sm1 - isc->sm2;
+	do_div(dx, dsm);
+	/*
+	 * check if (x, y1) belongs to the 1st segment of rtsc.
+	 * if so, add the offset.
+	 */
+	if (rtsc->x + rtsc->dx > x)
+		dx += rtsc->x + rtsc->dx - x;
+	dy = seg_x2y(dx, isc->sm1);
+
+	rtsc->x = x;
+	rtsc->y = y;
+	rtsc->dx = dx;
+	rtsc->dy = dy;
+	return;
+}
+
+/*
+ * (re)start real-time scheduling for cl: refresh the deadline and
+ * eligible curves from the current time, compute cl_e and cl_d for a
+ * head packet of next_len bytes, and insert cl into the eligible tree.
+ */
+static void
+init_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+	u64 cur_time;
+
+	PSCHED_GET_TIME(cur_time);
+
+	/* update the deadline curve */
+	rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul);
+
+	/*
+	 * update the eligible curve.
+	 * for concave, it is equal to the deadline curve.
+	 * for convex, it is a linear curve with slope m2.
+	 */
+	cl->cl_eligible = cl->cl_deadline;
+	if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) {
+		cl->cl_eligible.dx = 0;
+		cl->cl_eligible.dy = 0;
+	}
+
+	/* compute e and d */
+	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+	eltree_insert(cl);
+}
+
+/*
+ * recompute cl_e and cl_d from the existing curves and reposition cl
+ * in the eligible tree; cl must already be in that tree.
+ */
+static void
+update_ed(struct hfsc_class *cl, unsigned int next_len)
+{
+	cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul);
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+
+	eltree_update(cl);
+}
+
+/* recompute only the deadline cl_d; the eligible tree is not touched */
+static inline void
+update_d(struct hfsc_class *cl, unsigned int next_len)
+{
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+}
+
+/*
+ * set cl_cfmin to the cl_f of the first node in cl's cf_tree, or to 0
+ * when the tree is empty
+ */
+static inline void
+update_cfmin(struct hfsc_class *cl)
+{
+	rb_node_t *n = rb_first(&cl->cf_tree);
+	struct hfsc_class *p;
+
+	if (n == NULL) {
+		cl->cl_cfmin = 0;
+		return;
+	}
+	p = rb_entry(n, struct hfsc_class, cf_node);
+	cl->cl_cfmin = p->cl_f;
+}
+
+/*
+ * link-sharing activation: walk from cl towards the root (the class
+ * without a parent is not processed). cl_nactive is incremented on the
+ * way up for as long as every class below was newly activated; a class
+ * that thereby becomes active gets its virtual time, virtual curve and,
+ * with HFSC_USC, its upperlimit curve set up and is inserted into the
+ * parent's vt and cf trees. cl_f is refreshed at every visited level.
+ * the len argument is not used by the body.
+ */
+static void
+init_vf(struct hfsc_class *cl, unsigned int len)
+{
+	struct hfsc_class *max_cl;
+	rb_node_t *n;
+	u64 vt, f, cur_time;
+	int go_active;
+
+	/* cur_time is fetched lazily, only if some class has HFSC_USC */
+	cur_time = 0;
+	go_active = 1;
+	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+		if (go_active && cl->cl_nactive++ == 0)
+			go_active = 1;
+		else
+			go_active = 0;
+
+		if (go_active) {
+			n = rb_last(&cl->cl_parent->vt_tree);
+			if (n != NULL) {
+				max_cl = rb_entry(n, struct hfsc_class,vt_node);
+				/*
+				 * set vt to the average of the min and max
+				 * classes. if the parent's period didn't
+				 * change, don't decrease vt of the class.
+				 */
+				vt = max_cl->cl_vt;
+				if (cl->cl_parent->cl_cvtmin != 0)
+					vt = (cl->cl_parent->cl_cvtmin + vt)/2;
+
+				if (cl->cl_parent->cl_vtperiod !=
+				    cl->cl_parentperiod || vt > cl->cl_vt)
+					cl->cl_vt = vt;
+			} else {
+				/*
+				 * first child for a new parent backlog period.
+				 * add parent's cvtmax to cvtoff to make a new
+				 * vt (vtoff + vt) larger than the vt in the
+				 * last period for all children.
+				 */
+				vt = cl->cl_parent->cl_cvtmax;
+				cl->cl_parent->cl_cvtoff += vt;
+				cl->cl_parent->cl_cvtmax = 0;
+				cl->cl_parent->cl_cvtmin = 0;
+				cl->cl_vt = 0;
+			}
+
+			cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
+							cl->cl_pcvtoff;
+
+			/* update the virtual curve */
+			vt = cl->cl_vt + cl->cl_vtoff;
+			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
+						      cl->cl_total);
+			/*
+			 * if the curve now starts at vt, fold the offset
+			 * into the curve itself and clear cl_vtoff
+			 */
+			if (cl->cl_virtual.x == vt) {
+				cl->cl_virtual.x -= cl->cl_vtoff;
+				cl->cl_vtoff = 0;
+			}
+			cl->cl_vtadj = 0;
+
+			cl->cl_vtperiod++;  /* increment vt period */
+			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
+			if (cl->cl_parent->cl_nactive == 0)
+				cl->cl_parentperiod++;
+			cl->cl_f = 0;
+
+			vttree_insert(cl);
+			cftree_insert(cl);
+
+			if (cl->cl_flags & HFSC_USC) {
+				/* class has upper limit curve */
+				if (cur_time == 0)
+					PSCHED_GET_TIME(cur_time);
+
+				/* update the ulimit curve */
+				rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time,
+				         cl->cl_total);
+				/* compute myf */
+				cl->cl_myf = rtsc_y2x(&cl->cl_ulimit,
+				                      cl->cl_total);
+				cl->cl_myfadj = 0;
+			}
+		}
+
+		/* cl_f = max(own fit-time, earliest child fit-time) */
+		f = max(cl->cl_myf, cl->cl_cfmin);
+		if (f != cl->cl_f) {
+			cl->cl_f = f;
+			cftree_update(cl);
+			update_cfmin(cl->cl_parent);
+		}
+	}
+}
+
+/*
+ * link-sharing accounting: walk from cl towards the root (the class
+ * without a parent is not processed), add len to cl_total at every
+ * level and refresh vt and f of each class that has a fair curve and
+ * active children. when the leaf queue is empty the classes that lose
+ * their last active child are removed from the parent's vt and cf trees
+ * instead. cur_time is only referenced inside the disabled "#if 0" code.
+ */
+static void
+update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
+{
+	u64 f; /* , myf_bound, delta; */
+	int go_passive = 0;
+
+	if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
+		go_passive = 1;
+
+	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+		cl->cl_total += len;
+
+		if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
+			continue;
+
+		if (go_passive && --cl->cl_nactive == 0)
+			go_passive = 1;
+		else
+			go_passive = 0;
+
+		if (go_passive) {
+			/* no more active child, going passive */
+
+			/* update cvtmax of the parent class */
+			if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
+				cl->cl_parent->cl_cvtmax = cl->cl_vt;
+
+			/* remove this class from the vt tree */
+			vttree_remove(cl);
+
+			cftree_remove(cl);
+			update_cfmin(cl->cl_parent);
+
+			continue;
+		}
+
+		/*
+		 * update vt and f
+		 */
+		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+				- cl->cl_vtoff + cl->cl_vtadj;
+
+		/*
+		 * if vt of the class is smaller than cvtmin,
+		 * the class was skipped in the past due to non-fit.
+		 * if so, we need to adjust vtadj.
+		 */
+		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+			cl->cl_vt = cl->cl_parent->cl_cvtmin;
+		}
+
+		/* update the vt tree */
+		vttree_update(cl);
+
+		if (cl->cl_flags & HFSC_USC) {
+			cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
+							      cl->cl_total);
+#if 0
+			/*
+			 * This code causes classes to stay way under their
+			 * limit when multiple classes are used at gigabit
+			 * speed. needs investigation. -kaber
+			 */
+			/*
+			 * if myf lags behind by more than one clock tick
+			 * from the current time, adjust myfadj to prevent
+			 * a rate-limited class from going greedy.
+			 * in a steady state under rate-limiting, myf
+			 * fluctuates within one clock tick.
+			 */
+			myf_bound = cur_time - PSCHED_JIFFIE2US(1);
+			if (cl->cl_myf < myf_bound) {
+				delta = cur_time - cl->cl_myf;
+				cl->cl_myfadj += delta;
+				cl->cl_myf += delta;
+			}
+#endif
+		}
+
+		/* cl_f = max(own fit-time, earliest child fit-time) */
+		f = max(cl->cl_myf, cl->cl_cfmin);
+		if (f != cl->cl_f) {
+			cl->cl_f = f;
+			cftree_update(cl);
+			update_cfmin(cl->cl_parent);
+		}
+	}
+}
+
+/*
+ * mark cl as backlogged: start real-time and/or link-sharing scheduling
+ * according to cl_flags and append cl to the scheduler's drop list.
+ * len is passed on as the length of the head packet.
+ */
+static void
+set_active(struct hfsc_class *cl, unsigned int len)
+{
+	if (cl->cl_flags & HFSC_RSC)
+		init_ed(cl, len);
+	if (cl->cl_flags & HFSC_FSC)
+		init_vf(cl, len);
+
+	list_add_tail(&cl->dlist, &cl->sched->droplist);
+}
+
+/*
+ * undo set_active() for the eligible tree and the drop list only; see
+ * the comment at the end of the body for the vt tree.
+ */
+static void
+set_passive(struct hfsc_class *cl)
+{
+	if (cl->cl_flags & HFSC_RSC)
+		eltree_remove(cl);
+
+	list_del(&cl->dlist);
+
+	/*
+	 * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
+	 * needs to be called explicitly to remove a class from vttree.
+	 */
+}
+
+/*
+ * hack to get length of first packet in queue.
+ */ +static unsigned int +qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->dequeue(sch); + if (skb == NULL) { + if (net_ratelimit()) + printk("qdisc_peek_len: non work-conserving qdisc ?\n"); + return 0; + } + len = skb->len; + if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { + if (net_ratelimit()) + printk("qdisc_peek_len: failed to requeue\n"); + return 0; + } + return len; +} + +static void +hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + if (len > 0) { + update_vf(cl, 0, 0); + set_passive(cl); + sch->q.qlen -= len; + } +} + +static void +hfsc_adjust_levels(struct hfsc_class *cl) +{ + struct hfsc_class *p; + unsigned int level; + + do { + level = 0; + list_for_each_entry(p, &cl->children, siblings) { + if (p->level > level) + level = p->level; + } + cl->level = level + 1; + } while ((cl = cl->cl_parent) != NULL); +} + +static inline unsigned int +hfsc_hash(u32 h) +{ + h ^= h >> 8; + h ^= h >> 4; + + return h & (HFSC_HSIZE - 1); +} + +static inline struct hfsc_class * +hfsc_find_class(u32 classid, struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + + list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { + if (cl->classid == classid) + return cl; + } + return NULL; +} + +static void +hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, + u64 cur_time) +{ + sc2isc(rsc, &cl->cl_rsc); + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + cl->cl_flags |= HFSC_RSC; +} + +static void +hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) +{ + sc2isc(fsc, &cl->cl_fsc); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + cl->cl_flags |= HFSC_FSC; +} + +static void 
+hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, + u64 cur_time) +{ + sc2isc(usc, &cl->cl_usc); + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); + cl->cl_flags |= HFSC_USC; +} + +static int +hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl = (struct hfsc_class *)*arg; + struct hfsc_class *parent = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_HFSC_MAX]; + struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; + u64 cur_time; + + if (opt == NULL || + rtattr_parse(tb, TCA_HFSC_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) + return -EINVAL; + + if (tb[TCA_HFSC_RSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) + return -EINVAL; + rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + if (rsc->m1 == 0 && rsc->m2 == 0) + rsc = NULL; + } + + if (tb[TCA_HFSC_FSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) + return -EINVAL; + fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (fsc->m1 == 0 && fsc->m2 == 0) + fsc = NULL; + } + + if (tb[TCA_HFSC_USC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) + return -EINVAL; + usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (usc->m1 == 0 && usc->m2 == 0) + usc = NULL; + } + + if (cl != NULL) { + if (parentid) { + if (cl->cl_parent && cl->cl_parent->classid != parentid) + return -EINVAL; + if (cl->cl_parent == NULL && parentid != TC_H_ROOT) + return -EINVAL; + } + PSCHED_GET_TIME(cur_time); + + sch_tree_lock(sch); + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, cur_time); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, cur_time); + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) + update_ed(cl, qdisc_peek_len(cl->qdisc)); + if (cl->cl_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + } + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + 
qdisc_kill_estimator(&cl->stats); + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); + } +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EEXIST; + + parent = &q->root; + if (parentid) { + parent = hfsc_find_class(parentid, sch); + if (parent == NULL) + return -ENOENT; + } + + if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) + return -EINVAL; + if (hfsc_find_class(classid, sch)) + return -EEXIST; + + if (rsc == NULL && fsc == NULL) + return -EINVAL; + + cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + memset(cl, 0, sizeof(struct hfsc_class)); + + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, 0); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, 0); + + cl->refcnt = 1; + cl->classid = classid; + cl->sched = q; + cl->cl_parent = parent; + cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + cl->stats.lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&cl->children); + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + + sch_tree_lock(sch); + list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); + list_add_tail(&cl->siblings, &parent->children); + if (parent->level == 0) + hfsc_purge_queue(sch, parent); + hfsc_adjust_levels(parent); + cl->cl_pcvtoff = parent->cl_cvtoff; + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); +#endif + *arg = (unsigned long)cl; + return 0; +} + +static void +hfsc_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} + +static void +hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + + hfsc_destroy_filters(&cl->filter_list); + qdisc_destroy(cl->qdisc); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&cl->stats); +#endif + if (cl != 
&q->root) + kfree(cl); +} + +static int +hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level > 0 || cl->filter_cnt > 0 || cl == &q->root) + return -EBUSY; + + sch_tree_lock(sch); + + list_del(&cl->hlist); + list_del(&cl->siblings); + hfsc_adjust_levels(cl->cl_parent); + hfsc_purge_queue(sch, cl); + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); + + sch_tree_unlock(sch); + return 0; +} + +static struct hfsc_class * +hfsc_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && + (cl = hfsc_find_class(skb->priority, sch)) != NULL) + if (cl->level == 0) + return cl; + + tcf = q->root.filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_POLICE + if (result == TC_POLICE_SHOT) + return NULL; +#endif + if ((cl = (struct hfsc_class *)res.class) == NULL) { + if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + break; /* filter selected invalid classid */ + } + + if (cl->level == 0) + return cl; /* hit leaf class */ + + /* apply inner filter chain */ + tcf = cl->filter_list; + } + + /* classification failed, try default class */ + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + if (cl == NULL || cl->level > 0) + return NULL; + + return cl; +} + +static int +hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + return -ENOENT; + if (cl->level > 0) + return -EINVAL; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + hfsc_purge_queue(sch, cl); + *old = 
xchg(&cl->qdisc, new); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc * +hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl != NULL && cl->level == 0) + return cl->qdisc; + + return NULL; +} + +static unsigned long +hfsc_get_class(struct Qdisc *sch, u32 classid) +{ + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void +hfsc_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); +} + +static unsigned long +hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + struct hfsc_class *p = (struct hfsc_class *)parent; + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) { + if (p != NULL && p->level <= cl->level) + return 0; + cl->filter_cnt++; + } + + return (unsigned long)cl; +} + +static void +hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + cl->filter_cnt--; +} + +static struct tcf_proto ** +hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + cl = &q->root; + + return &cl->filter_list; +} + +static int +hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) +{ + struct tc_service_curve tsc; + + tsc.m1 = sm2m(sc->sm1); + tsc.d = dx2d(sc->dx); + tsc.m2 = sm2m(sc->sm2); + RTA_PUT(skb, attr, sizeof(tsc), &tsc); + + return skb->len; + + rtattr_failure: + return -1; +} + +static inline int +hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) +{ + if ((cl->cl_flags & HFSC_RSC) && + (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) + goto rtattr_failure; + + if ((cl->cl_flags & HFSC_FSC) && + (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) + 
goto rtattr_failure; + + if ((cl->cl_flags & HFSC_USC) && + (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) + goto rtattr_failure; + + return skb->len; + + rtattr_failure: + return -1; +} + +static inline int +hfsc_dump_stats(struct sk_buff *skb, struct hfsc_class *cl) +{ + cl->stats.qlen = cl->qdisc->q.qlen; + if (qdisc_copy_stats(skb, &cl->stats) < 0) + goto rtattr_failure; + + return skb->len; + + rtattr_failure: + return -1; +} + +static inline int +hfsc_dump_xstats(struct sk_buff *skb, struct hfsc_class *cl) +{ + struct tc_hfsc_stats xstats; + + xstats.level = cl->level; + xstats.period = cl->cl_vtperiod; + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + RTA_PUT(skb, TCA_XSTATS, sizeof(xstats), &xstats); + + return skb->len; + + rtattr_failure: + return -1; +} + +static int +hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *)b; + + tcm->tcm_parent = cl->cl_parent ? 
cl->cl_parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (cl->level == 0) + tcm->tcm_info = cl->qdisc->handle; + + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (hfsc_dump_curves(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + + if ((hfsc_dump_stats(skb, cl) < 0) || + (hfsc_dump_xstats(skb, cl) < 0)) + goto rtattr_failure; + + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void +hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry(cl, &q->clhash[i], hlist) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static void +hfsc_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static void +hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + u64 next_time = 0; + long delay; + + if ((cl = eltree_get_minel(q)) != NULL) + next_time = cl->cl_e; + if (q->root.cl_cfmin != 0) { + if (next_time == 0 || next_time > q->root.cl_cfmin) + next_time = q->root.cl_cfmin; + } + ASSERT(next_time != 0); + delay = next_time - cur_time; + delay = PSCHED_US2JIFFIE(delay); + + sch->flags |= TCQ_F_THROTTLED; + mod_timer(&q->wd_timer, jiffies + delay); +} + +static int +hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct tc_hfsc_qopt *qopt; + unsigned int i; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch->stats.lock = &sch->dev->queue_lock; + + q->defcls = qopt->defcls; + for (i = 0; i < 
HFSC_HSIZE; i++) + INIT_LIST_HEAD(&q->clhash[i]); + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + skb_queue_head_init(&q->requeue); + + q->root.refcnt = 1; + q->root.classid = sch->handle; /* the root class shares the qdisc's own handle */ + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (q->root.qdisc == NULL) + q->root.qdisc = &noop_qdisc; /* fall back to the no-op qdisc when the default pfifo cannot be created */ + q->root.stats.lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&q->root.children); + q->root.vt_tree = RB_ROOT; + q->root.cf_tree = RB_ROOT; + + list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); + + init_timer(&q->wd_timer); /* the timer is only prepared here; it is armed by mod_timer() elsewhere */ + q->wd_timer.function = hfsc_watchdog; + q->wd_timer.data = (unsigned long)sch; + + MOD_INC_USE_COUNT; + return 0; +} +/* Change the default class id from the netlink options. A missing or too short option payload is rejected with -EINVAL; the new value is stored under the qdisc tree lock. Returns 0 on success. */ +static int +hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct tc_hfsc_qopt *qopt; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch_tree_lock(sch); + q->defcls = qopt->defcls; + sch_tree_unlock(sch); + + return 0; +} +/* Reset one class: zero its counters and time values, empty its vt/cf trees, reset its inner qdisc, then re-run rtsc_init() for every service curve flagged in cl_flags (HFSC_RSC, HFSC_FSC, HFSC_USC). */ +static void +hfsc_reset_class(struct hfsc_class *cl) +{ + cl->cl_total = 0; + cl->cl_cumul = 0; + cl->cl_d = 0; + cl->cl_e = 0; + cl->cl_vt = 0; + cl->cl_vtadj = 0; + cl->cl_vtoff = 0; + cl->cl_cvtmin = 0; + cl->cl_cvtmax = 0; + cl->cl_cvtoff = 0; + cl->cl_pcvtoff = 0; + cl->cl_vtperiod = 0; + cl->cl_parentperiod = 0; + cl->cl_f = 0; + cl->cl_myf = 0; + cl->cl_myfadj = 0; + cl->cl_cfmin = 0; + cl->cl_nactive = 0; + + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + qdisc_reset(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); + if (cl->cl_flags & HFSC_FSC) + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); + if (cl->cl_flags & HFSC_USC) + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); +} +/* Reset the whole qdisc: every class first, then the scheduler-wide state. */ +static void +hfsc_reset_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + 
list_for_each_entry(cl, &q->clhash[i], hlist) + hfsc_reset_class(cl); + } + __skb_queue_purge(&q->requeue); /* drop anything parked on the private requeue list */ + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + del_timer(&q->wd_timer); + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen = 0; +} +/* Tear the qdisc down: hand every class to hfsc_destroy_class(), purge the requeue list and stop the watchdog timer. */ +static void +hfsc_destroy_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl, *next; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) /* _safe variant: the next entry is fetched before the current class is handed over */ + hfsc_destroy_class(sch, cl); + } + __skb_queue_purge(&q->requeue); + del_timer(&q->wd_timer); + MOD_DEC_USE_COUNT; +} +/* Dump the qdisc options (only the default class id) as a TCA_OPTIONS attribute. Returns skb->len on success; on the rtattr_failure path the skb is trimmed back to its length at entry and -1 is returned. */ +static int +hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + unsigned char *b = skb->tail; /* remembered so a failed dump can be undone */ + struct tc_hfsc_qopt qopt; + + qopt.defcls = q->defcls; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +/* Classify skb and pass it to the inner qdisc of the selected class. A packet with no class is freed and counted as a drop; a failed inner enqueue is counted as a drop and its error code is returned unchanged. */ +static int +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_class *cl = hfsc_classify(skb, sch); + unsigned int len = skb->len; /* read before the inner enqueue; skb is not touched afterwards */ + int err; + + if (cl == NULL) { + kfree_skb(skb); + sch->stats.drops++; + return NET_XMIT_DROP; + } + + err = cl->qdisc->enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + cl->stats.drops++; + sch->stats.drops++; + return err; + } + + if (cl->qdisc->q.qlen == 1) /* the inner queue now holds exactly one packet */ + set_active(cl, len); + + cl->stats.packets++; + cl->stats.bytes += len; + sch->stats.packets++; + sch->stats.bytes += len; + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} +/* Pick the next packet to send: requeued packets first, otherwise a packet from the class chosen by the criteria described in the comments below. */ +static struct sk_buff * +hfsc_dequeue(struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + struct sk_buff *skb; + u64 cur_time; + unsigned int next_len; + int realtime = 0; + + if (sch->q.qlen == 0) + return NULL; + if ((skb = __skb_dequeue(&q->requeue))) + goto out; /* a requeued packet bypasses class selection entirely */ + + PSCHED_GET_TIME(cur_time); + + /* + * if there are eligible classes, use 
real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + realtime = 1; + } else { + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = vttree_get_minvt(&q->root, cur_time); + if (cl == NULL) { + sch->stats.overlimits++; /* nothing may be sent right now: count it and arm the watchdog */ + hfsc_schedule_watchdog(sch, cur_time); + return NULL; + } + } + + skb = cl->qdisc->dequeue(cl->qdisc); + if (skb == NULL) { + if (net_ratelimit()) + printk("HFSC: Non-work-conserving qdisc ?\n"); + return NULL; + } + + update_vf(cl, skb->len, cur_time); + if (realtime) + cl->cl_cumul += skb->len; /* cl_cumul is only advanced for packets picked by the real-time criteria */ + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) { + /* update ed (presumably eligible time and deadline; update_d presumably only the deadline - confirm) from the length of the next queued packet */ + next_len = qdisc_peek_len(cl->qdisc); + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + out: + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + + return skb; +} +/* Put skb at the head of the private requeue list and count it in sch->q.qlen; always reports NET_XMIT_SUCCESS. */ +static int +hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + + __skb_queue_head(&q->requeue, skb); + sch->q.qlen++; + return NET_XMIT_SUCCESS; +} +/* Drop one packet: try the classes on the drop list in order and return the non-zero value reported by the first inner drop operation that succeeds, or 0 if no class could drop anything. */ +static unsigned int +hfsc_drop(struct Qdisc *sch) +{ + struct hfsc_sched *q = (struct hfsc_sched *)sch->data; + struct hfsc_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->droplist, dlist) { + if (cl->qdisc->ops->drop != NULL && + (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } else { + list_move_tail(&cl->dlist, &q->droplist); /* rotate the class to the tail so the next drop starts with another class */ + } + cl->stats.drops++; + sch->stats.drops++; + sch->q.qlen--; + return len; + } + } + return 0; +} +/* Per-class operations table, referenced through .cl_ops of the qdisc operations. */ +static struct Qdisc_class_ops hfsc_class_ops = { + .change = hfsc_change_class, + .delete = hfsc_delete_class, + .graft = hfsc_graft_class, + .leaf = hfsc_class_leaf, + .get = hfsc_get_class, + .put = hfsc_put_class, + .bind_tcf = 
hfsc_bind_tcf, + .unbind_tcf = hfsc_unbind_tcf, + .tcf_chain = hfsc_tcf_chain, + .dump = hfsc_dump_class, + .walk = hfsc_walk +}; +/* Qdisc operations table for the "hfsc" discipline; priv_size makes sch->data a struct hfsc_sched. */ +struct Qdisc_ops hfsc_qdisc_ops = { + .id = "hfsc", + .init = hfsc_init_qdisc, + .change = hfsc_change_qdisc, + .reset = hfsc_reset_qdisc, + .destroy = hfsc_destroy_qdisc, + .dump = hfsc_dump_qdisc, + .enqueue = hfsc_enqueue, + .dequeue = hfsc_dequeue, + .requeue = hfsc_requeue, + .drop = hfsc_drop, + .cl_ops = &hfsc_class_ops, + .priv_size = sizeof(struct hfsc_sched) +}; +/* Module entry point: register the HFSC qdisc and return register_qdisc()'s result. */ +static int __init +hfsc_init(void) +{ + return register_qdisc(&hfsc_qdisc_ops); +} +/* Module exit: unregister the HFSC qdisc. */ +static void __exit +hfsc_cleanup(void) +{ + unregister_qdisc(&hfsc_qdisc_ops); +} + +MODULE_LICENSE("GPL"); +module_init(hfsc_init); +module_exit(hfsc_cleanup); diff --git a/release/src/linux/linux/net/sched/sch_htb.c b/release/src/linux/linux/net/sched/sch_htb.c index 7539e490..944cb555 100644 --- a/release/src/linux/linux/net/sched/sch_htb.c +++ b/release/src/linux/linux/net/sched/sch_htb.c @@ -9,6 +9,8 @@ * Authors: Martin Devera, * * Credits (in time order) for older HTB versions: + * Stef Coene + * HTB support at LARTC mailing list * Ondrej Kraus, * found missing INIT_QDISC(htb) * Vladimir Smelhaus, Aamer Akhter, Bert Hubert @@ -17,9 +19,13 @@ * code review and helpful comments on shaping * Tomasz Wrona, * created test case so that I was able to fix nasty bug + * Wilfried Weissmann + * spotted bug in dequeue code and helped with fix + * Jiri Fojtasek + * fixed requeue routine * and many others. thanks. 
* - * $Id: sch_htb.c,v 1.1.1.4 2003/10/14 08:09:35 sparq Exp $ + * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ */ #include #include @@ -66,21 +72,17 @@ #define HTB_HSIZE 16 /* classid hash size */ #define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ -#define HTB_DEBUG 1 /* compile debugging support (activated by tc tool) */ +//#define HTB_DEBUG 1 /* compile debugging support (activated by tc tool) */ #define HTB_RATECM 1 /* whether to use rate computer */ -#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ +#define HTB_HYSTERESIS 0/* whether to use mode hysteresis for speedup */ #define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) #define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) -#define HTB_VER 0x30007 /* major must be matched with number suplied by TC as version */ +#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" #endif -/* temporary debug defines to be removed after beta stage */ -#define DEVIK_MEND(N) -#define DEVIK_MSTART(N) - /* debugging support; S is subsystem, these are defined: 0 - netlink messages 1 - enqueue @@ -100,13 +102,16 @@ from LSB */ #ifdef HTB_DEBUG -#define HTB_DBG(S,L,FMT,ARG...) if (((q->debug>>(2*S))&3) >= L) \ +#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) +#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ printk(KERN_DEBUG FMT,##ARG) #define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) #define HTB_PASSQ q, #define HTB_ARGQ struct htb_sched *q, #define static +#undef __inline__ #define __inline__ +#undef inline #define inline #define HTB_CMAGIC 0xFEFAFEF1 #define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ @@ -114,6 +119,7 @@ rb_erase(N,R); \ (N)->rb_color = -1; } while (0) #else +#define HTB_DBG_COND(S,L) (0) #define HTB_DBG(S,L,FMT,ARG...) 
#define HTB_PASSQ #define HTB_ARGQ @@ -166,6 +172,11 @@ struct htb_class struct htb_class_inner { rb_root_t feed[TC_HTB_NUMPRIO]; /* feed trees */ rb_node_t *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ + /* When class changes from state 1->2 and disconnects from + parent's feed then we lost ptr value and start from the + first child again. Here we store classid of the + last valid ptr (used when ptr is NULL). */ + u32 last_ptr_id[TC_HTB_NUMPRIO]; } inner; } un; rb_node_t node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ @@ -212,6 +223,7 @@ struct htb_sched rb_root_t row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; int row_mask[TC_HTB_MAXDEPTH]; rb_node_t *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; /* self wait list - roots of wait PQs per row */ rb_root_t wait_pq[TC_HTB_MAXDEPTH]; @@ -219,6 +231,9 @@ struct htb_sched /* time of nearest event per level (row) */ unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; + /* cached value of jiffies in dequeue */ + unsigned long jiffies; + /* whether we hit non-work conserving class during this dequeue; we use */ int nwc_hit; /* this to disable mindelay complaint in dequeue */ @@ -297,7 +312,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch) rules in it */ if (skb->priority == sch->handle) return HTB_DIRECT; /* X:0 (direct flow) selected */ - if ((cl = htb_find(skb->priority,sch)) != NULL) + if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) return cl; tcf = q->filter_list; @@ -338,7 +353,7 @@ static void htb_next_rb_node(rb_node_t **n); static void htb_debug_dump (struct htb_sched *q) { int i,p; - printk(KERN_DEBUG "htb*g j=%lu\n",jiffies); + printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); /* rows */ for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); @@ -421,26 +436,24 @@ static void htb_add_to_wait_tree (struct htb_sched *q, if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) 
printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); #endif - DEVIK_MSTART(9); - cl->pq_key = jiffies + PSCHED_US2JIFFIE(delay); - if (cl->pq_key == jiffies) + cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); + if (cl->pq_key == q->jiffies) cl->pq_key++; /* update the nearest event cache */ - if (q->near_ev_cache[cl->level] - cl->pq_key < 0x80000000) + if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) q->near_ev_cache[cl->level] = cl->pq_key; while (*p) { struct htb_class *c; parent = *p; c = rb_entry(parent, struct htb_class, pq_node); - if (cl->pq_key - c->pq_key < 0x80000000) + if (time_after_eq(cl->pq_key, c->pq_key)) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&cl->pq_node, parent, p); rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); - DEVIK_MEND(9); } /** @@ -453,12 +466,14 @@ static void htb_next_rb_node(rb_node_t **n) { rb_node_t *p; if ((*n)->rb_right) { + /* child at right. use it or its leftmost descendant */ *n = (*n)->rb_right; while ((*n)->rb_left) *n = (*n)->rb_left; return; } while ((p = (*n)->rb_parent) != NULL) { + /* if we've arrived from left child then we have next node */ if (p->rb_left == *n) break; *n = p; } @@ -567,8 +582,13 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.ptr[prio] == cl->node+prio) - htb_next_rb_node(p->un.inner.ptr + prio); + if (p->un.inner.ptr[prio] == cl->node+prio) { + /* we are removing child which is pointed to from + parent feed - forget the pointer but remember + classid */ + p->un.inner.last_ptr_id[prio] = cl->classid; + p->un.inner.ptr[prio] = NULL; + } htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); @@ -602,7 +622,7 @@ htb_class_mode(struct htb_class *cl,long *diff) long toks; if ((toks = (cl->ctokens + *diff)) < ( -#ifdef HTB_HYSTERESIS +#if HTB_HYSTERESIS cl->cmode != HTB_CANT_SEND ? 
-cl->cbuffer : #endif 0)) { @@ -610,7 +630,7 @@ htb_class_mode(struct htb_class *cl,long *diff) return HTB_CANT_SEND; } if ((toks = (cl->tokens + *diff)) >= ( -#ifdef HTB_HYSTERESIS +#if HTB_HYSTERESIS cl->cmode == HTB_CAN_SEND ? -cl->buffer : #endif 0)) @@ -689,7 +709,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) struct htb_sched *q = (struct htb_sched *)sch->data; struct htb_class *cl = htb_classify(skb,sch); - DEVIK_MSTART(0); if (cl == HTB_DIRECT || !cl) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen && cl) { @@ -698,25 +717,20 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } else { kfree_skb (skb); sch->stats.drops++; - DEVIK_MEND(0); return NET_XMIT_DROP; } } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { sch->stats.drops++; cl->stats.drops++; - DEVIK_MEND(0); return NET_XMIT_DROP; } else { cl->stats.packets++; cl->stats.bytes += skb->len; - DEVIK_MSTART(1); htb_activate (q,cl); - DEVIK_MEND(1); } sch->q.qlen++; sch->stats.packets++; sch->stats.bytes += skb->len; - HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",cl?cl->classid:0,skb); - DEVIK_MEND(0); + HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); return NET_XMIT_SUCCESS; } @@ -725,16 +739,18 @@ static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) { struct htb_sched *q = (struct htb_sched *)sch->data; struct htb_class *cl = htb_classify(skb,sch); + struct sk_buff *tskb; if (cl == HTB_DIRECT || !cl) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen && cl) { - __skb_queue_tail(&q->direct_queue, skb); - q->direct_pkts++; + __skb_queue_head(&q->direct_queue, skb); } else { - kfree_skb (skb); - sch->stats.drops++; - return NET_XMIT_DROP; + __skb_queue_head(&q->direct_queue, skb); + tskb = __skb_dequeue_tail(&q->direct_queue); + kfree_skb (tskb); + sch->stats.drops++; + return NET_XMIT_CN; } } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != 
NET_XMIT_SUCCESS) { sch->stats.drops++; @@ -744,7 +760,7 @@ static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) htb_activate (q,cl); sch->q.qlen++; - HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",cl?cl->classid:0,skb); + HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); return NET_XMIT_SUCCESS; } @@ -819,7 +835,7 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, cl->classid, diff, (unsigned long long) q->now, (unsigned long long) cl->t_c, - jiffies); + q->jiffies); diff = 1000; } #endif @@ -862,6 +878,7 @@ static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, * * Scans event queue for pending events and applies them. Returns jiffies to * next pending event (0 for no event in pq). + * Note: Applied are the events that have cl->pq_key <= q->jiffies. */ static long htb_do_events(struct htb_sched *q,int level) { @@ -876,9 +893,9 @@ static long htb_do_events(struct htb_sched *q,int level) while (p->rb_left) p = p->rb_left; cl = rb_entry(p, struct htb_class, pq_node); - if (cl->pq_key - (jiffies+1) < 0x80000000) { - HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - jiffies); - return cl->pq_key - jiffies; + if (time_after(cl->pq_key, q->jiffies)) { + HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); + return cl->pq_key - q->jiffies; } htb_safe_rb_erase(p,q->wait_pq+level); diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); @@ -889,7 +906,7 @@ static long htb_do_events(struct htb_sched *q,int level) cl->classid, diff, (unsigned long long) q->now, (unsigned long long) cl->t_c, - jiffies); + q->jiffies); diff = 1000; } #endif @@ -902,24 +919,56 @@ static long htb_do_events(struct htb_sched *q,int level) return HZ/10; } +/* Returns class->node+prio from id-tree where the class's id is >= id. NULL + if no such one exists. 
*/ +static rb_node_t * +htb_id_find_next_upper(int prio,rb_node_t *n,u32 id) +{ + rb_node_t *r = NULL; + while (n) { + struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); + if (id == cl->classid) return n; + + if (id > cl->classid) { + n = n->rb_right; + } else { + r = n; + n = n->rb_left; + } + } + return r; +} + /** * htb_lookup_leaf - returns next leaf class in DRR order * * Find leaf where current feed pointers points to. */ static struct htb_class * -htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr) +htb_lookup_leaf(HTB_ARGQ rb_root_t *tree,int prio,rb_node_t **pptr,u32 *pid) { int i; struct { rb_node_t *root; rb_node_t **pptr; + u32 *pid; } stk[TC_HTB_MAXDEPTH],*sp = stk; + BUG_TRAP(tree->rb_node); sp->root = tree->rb_node; sp->pptr = pptr; + sp->pid = pid; for (i = 0; i < 65535; i++) { + HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); + + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + the original or next ptr */ + *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); + } + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + can become out of date quickly */ if (!*sp->pptr) { /* we are at right end; rewind & go up */ *sp->pptr = sp->root; while ((*sp->pptr)->rb_left) @@ -937,6 +986,7 @@ htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr) return cl; (++sp)->root = cl->un.inner.feed[prio].rb_node; sp->pptr = cl->un.inner.ptr+prio; + sp->pid = cl->un.inner.last_ptr_id+prio; } } BUG_TRAP(0); @@ -949,16 +999,37 @@ static struct sk_buff * htb_dequeue_tree(struct htb_sched *q,int prio,int level) { struct sk_buff *skb = NULL; - //struct htb_sched *q = (struct htb_sched *)sch->data; struct htb_class *cl,*start; /* look initial class up in the row */ - DEVIK_MSTART(6); - start = cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); + start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, + q->ptr[level]+prio,q->last_ptr_id[level]+prio); do { - 
BUG_TRAP(cl && cl->un.leaf.q->q.qlen); if (!cl) return NULL; +next: + BUG_TRAP(cl); + if (!cl) return NULL; HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", prio,level,cl->classid,cl->un.leaf.deficit[level]); + + /* class can be empty - it is unlikely but can be true if leaf + qdisc drops packets in enqueue routine or if someone used + graft operation on the leaf since last dequeue; + simply deactivate and skip such class */ + if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + struct htb_class *next; + htb_deactivate(q,cl); + + /* row/level might become empty */ + if ((q->row_mask[level] & (1 << prio)) == 0) + return NULL; + + next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, + prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); + if (cl == start) /* fix start if we just deleted it */ + start = next; + cl = next; + goto next; + } if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) break; @@ -968,11 +1039,10 @@ htb_dequeue_tree(struct htb_sched *q,int prio,int level) } q->nwc_hit++; htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); - cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); + cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, + q->last_ptr_id[level]+prio); } while (cl != start); - DEVIK_MEND(6); - DEVIK_MSTART(7); if (likely(skb != NULL)) { if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", @@ -984,27 +1054,22 @@ htb_dequeue_tree(struct htb_sched *q,int prio,int level) gives us slightly better performance */ if (!cl->un.leaf.q->q.qlen) htb_deactivate (q,cl); - DEVIK_MSTART(8); htb_charge_class (q,cl,level,skb->len); - DEVIK_MEND(8); } - DEVIK_MEND(7); return skb; } static void htb_delay_by(struct Qdisc *sch,long delay) { struct htb_sched *q = (struct htb_sched *)sch->data; - if (netif_queue_stopped(sch->dev)) return; if (delay <= 0) delay = 1; if (unlikely(delay > 5*HZ)) { if (net_ratelimit()) printk(KERN_INFO "HTB delay 
%ld > 5sec\n", delay); delay = 5*HZ; } - del_timer(&q->timer); - q->timer.expires = jiffies + delay; - add_timer(&q->timer); + /* why don't use jiffies here ? because expires can be in past */ + mod_timer(&q->timer, q->jiffies + delay); sch->flags |= TCQ_F_THROTTLED; sch->stats.overlimits++; HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); @@ -1016,7 +1081,11 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) struct htb_sched *q = (struct htb_sched *)sch->data; int level; long min_delay; +#ifdef HTB_DEBUG + int evs_used = 0; +#endif + q->jiffies = jiffies; HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), sch->q.qlen); @@ -1027,27 +1096,26 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) return skb; } - DEVIK_MSTART(2); if (!sch->q.qlen) goto fin; PSCHED_GET_TIME(q->now); - min_delay = HZ*5; + min_delay = LONG_MAX; q->nwc_hit = 0; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ int m; long delay; - DEVIK_MSTART(3); - if (jiffies - q->near_ev_cache[level] < 0x80000000 || 0) { + if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { delay = htb_do_events(q,level); - q->near_ev_cache[level] += delay ? delay : HZ; + q->near_ev_cache[level] = q->jiffies + (delay ? 
delay : HZ); +#ifdef HTB_DEBUG + evs_used++; +#endif } else - delay = q->near_ev_cache[level] - jiffies; + delay = q->near_ev_cache[level] - q->jiffies; if (delay && min_delay > delay) min_delay = delay; - DEVIK_MEND(3); - DEVIK_MSTART(5); m = ~q->row_mask[level]; while (m != (int)(-1)) { int prio = ffz (m); @@ -1056,29 +1124,29 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) if (likely(skb != NULL)) { sch->q.qlen--; sch->flags &= ~TCQ_F_THROTTLED; - DEVIK_MEND(5); goto fin; } } - DEVIK_MEND(5); } - DEVIK_MSTART(4); #ifdef HTB_DEBUG - if (!q->nwc_hit && min_delay >= 5*HZ && net_ratelimit()) { - printk(KERN_ERR "HTB: mindelay=%ld, report it please !\n",min_delay); - htb_debug_dump(q); + if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { + if (min_delay == LONG_MAX) { + printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", + evs_used,q->jiffies,jiffies); + htb_debug_dump(q); + } else + printk(KERN_WARNING "HTB: mindelay=%ld, some class has " + "too small rate\n",min_delay); } #endif - htb_delay_by (sch,min_delay); - DEVIK_MEND(4); + htb_delay_by (sch,min_delay > 5*HZ ? 
5*HZ : min_delay); fin: - HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,jiffies,skb); - DEVIK_MEND(2); + HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); return skb; } /* try to drop from each class (by prio) until one succeed */ -static int htb_drop(struct Qdisc* sch) +static unsigned int htb_drop(struct Qdisc* sch) { struct htb_sched *q = (struct htb_sched *)sch->data; int prio; @@ -1086,14 +1154,15 @@ static int htb_drop(struct Qdisc* sch) for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { struct list_head *p; list_for_each (p,q->drops+prio) { - struct htb_class *cl = list_entry(p,struct htb_class, - un.leaf.drop_list); + struct htb_class *cl = list_entry(p, struct htb_class, + un.leaf.drop_list); + unsigned int len; if (cl->un.leaf.q->ops->drop && - cl->un.leaf.q->ops->drop(cl->un.leaf.q)) { + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { sch->q.qlen--; if (!cl->un.leaf.q->q.qlen) htb_deactivate (q,cl); - return 1; + return len; } } } @@ -1162,7 +1231,6 @@ static int htb_init(struct Qdisc *sch, struct rtattr *opt) HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); return -EINVAL; } - memset(q,0,sizeof(*q)); q->debug = gopt->debug; HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); @@ -1208,7 +1276,8 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) gopt.direct_pkts = q->direct_pkts; #ifdef HTB_DEBUG - htb_debug_dump(q); + if (HTB_DBG_COND(0,2)) + htb_debug_dump(q); #endif gopt.version = HTB_VER; gopt.rate2quantum = q->rate2quantum; @@ -1218,8 +1287,6 @@ static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) RTA_PUT(skb, TCA_OPTIONS, 0, NULL); RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); rta->rta_len = skb->tail - b; - sch->stats.qlen = sch->q.qlen; - RTA_PUT(skb, TCA_STATS, sizeof(sch->stats), &sch->stats); HTB_QUNLOCK(sch); return skb->len; rtattr_failure: @@ -1289,6 +1356,9 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, return 
-ENOBUFS; sch_tree_lock(sch); if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { + if (cl->prio_activity) + htb_deactivate ((struct htb_sched*)sch->data,cl); + /* TODO: is it correct ? Why CBQ doesn't do it ? */ sch->q.qlen -= (*old)->q.qlen; qdisc_reset(*old); @@ -1323,7 +1393,7 @@ static void htb_destroy_filters(struct tcf_proto **fl) while ((tp = *fl) != NULL) { *fl = tp->next; - tp->ops->destroy(tp); + tcf_destroy(tp); } } @@ -1371,11 +1441,16 @@ static void htb_destroy(struct Qdisc* sch) #ifdef HTB_RATECM del_timer_sync (&q->rttim); #endif + /* This line used to be after htb_destroy_class call below + and surprisingly it worked in 2.4. But it must precede it + because filter need its target class alive to be able to call + unbind_filter on it (without Oops). */ + htb_destroy_filters(&q->filter_list); + while (!list_empty(&q->root)) htb_destroy_class (sch,list_entry(q->root.next, struct htb_class,sibling)); - htb_destroy_filters(&q->filter_list); __skb_queue_purge(&q->direct_queue); MOD_DEC_USE_COUNT; } @@ -1438,12 +1513,13 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent = parentid == TC_H_ROOT ? 
NULL : htb_find (parentid,sch); hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); - HTB_DBG(0,1,"htb_chg cl=%p, clid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); + HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); if (!rtab || !ctab) goto failure; if (!cl) { /* new class */ + struct Qdisc *new_q; /* check for valid classid */ if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) goto failure; @@ -1467,6 +1543,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->magic = HTB_CMAGIC; #endif + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) + so that can't be used inside of sch_tree_lock + -- thanks to Karlis Peisenieks */ + new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); sch_tree_lock(sch); if (parent && !parent->level) { /* turn parent into inner node */ @@ -1485,8 +1565,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, memset (&parent->un.inner,0,sizeof(parent->un.inner)); } /* leaf (we) needs elementary qdisc */ - if (!(cl->un.leaf.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) - cl->un.leaf.q = &noop_qdisc; + cl->un.leaf.q = new_q ? new_q : &noop_qdisc; cl->classid = classid; cl->parent = parent; @@ -1514,11 +1593,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl->level) { cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; if (!hopt->quantum && cl->un.leaf.quantum < 1000) { - printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.", cl->classid); + printk(KERN_WARNING "HTB: quantum of class %X is small. 
Consider r2q change.\n", cl->classid); cl->un.leaf.quantum = 1000; } if (!hopt->quantum && cl->un.leaf.quantum > 200000) { - printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.", cl->classid); + printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); cl->un.leaf.quantum = 200000; } if (hopt->quantum) diff --git a/release/src/linux/linux/net/sched/sch_ingress.c b/release/src/linux/linux/net/sched/sch_ingress.c index 2b30fce0..70698728 100644 --- a/release/src/linux/linux/net/sched/sch_ingress.c +++ b/release/src/linux/linux/net/sched/sch_ingress.c @@ -250,7 +250,6 @@ int ingress_init(struct Qdisc *sch,struct rtattr *opt) } DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); - memset(p, 0, sizeof(*p)); p->filter_list = NULL; p->q = &noop_qdisc; MOD_INC_USE_COUNT; @@ -286,10 +285,7 @@ static void ingress_destroy(struct Qdisc *sch) p->filter_list = tp->next; tp->ops->destroy(tp); } - memset(p, 0, sizeof(*p)); - p->filter_list = NULL; - MOD_DEC_USE_COUNT; } diff --git a/release/src/linux/linux/net/sched/sch_sfq.c b/release/src/linux/linux/net/sched/sch_sfq.c index c96762fb..a6c17424 100644 --- a/release/src/linux/linux/net/sched/sch_sfq.c +++ b/release/src/linux/linux/net/sched/sch_sfq.c @@ -218,6 +218,7 @@ static int sfq_drop(struct Qdisc *sch) struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; sfq_index d = q->max_depth; struct sk_buff *skb; + int len; /* Queue is full! 
Find the longest slot and drop a packet from it */ @@ -225,12 +226,13 @@ static int sfq_drop(struct Qdisc *sch) if (d > 1) { sfq_index x = q->dep[d+SFQ_DEPTH].next; skb = q->qs[x].prev; + len = skb->len; __skb_unlink(skb, &q->qs[x]); kfree_skb(skb); sfq_dec(q, x); sch->q.qlen--; sch->stats.drops++; - return 1; + return len; } if (d == 1) { @@ -239,13 +241,14 @@ static int sfq_drop(struct Qdisc *sch) q->next[q->tail] = q->next[d]; q->allot[q->next[d]] += q->quantum; skb = q->qs[d].prev; + len = skb->len; __skb_unlink(skb, &q->qs[d]); kfree_skb(skb); sfq_dec(q, d); sch->q.qlen--; q->ht[q->hash[d]] = SFQ_DEPTH; sch->stats.drops++; - return 1; + return len; } return 0; @@ -342,6 +345,7 @@ sfq_dequeue(struct Qdisc* sch) /* Is the slot empty? */ if (q->qs[a].qlen == 0) { + q->ht[q->hash[a]] = SFQ_DEPTH; a = q->next[a]; if (a == old_a) { q->tail = SFQ_DEPTH; diff --git a/release/src/linux/linux/net/socket.c b/release/src/linux/linux/net/socket.c index d8b479c9..4816eeb6 100644 --- a/release/src/linux/linux/net/socket.c +++ b/release/src/linux/linux/net/socket.c @@ -607,6 +607,9 @@ ssize_t sock_sendpage(struct file *file, struct page *page, if (more) flags |= MSG_MORE; + if (!sock->ops->sendpage) + return sock_no_sendpage(sock, page, offset, size, flags); + return sock->ops->sendpage(sock, page, offset, size, flags); } -- cgit v1.2.3-54-g00ecf