diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2015-01-03 12:04:58 +0100 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2015-01-03 12:04:58 +0100 |
commit | 008d0be72b2f160382c6e880765e96b64a050c65 (patch) | |
tree | 36f48a98a3815a408e2ce1693dd182af90f80305 /release/src/linux/linux/net/sched | |
parent | 611becfb8726c60cb060368541ad98191d4532f5 (diff) | |
download | tomato-008d0be72b2f160382c6e880765e96b64a050c65.tar.gz tomato-008d0be72b2f160382c6e880765e96b64a050c65.tar.bz2 |
imported original firmware WRT54GL_v4.30.11_11_US
Diffstat (limited to 'release/src/linux/linux/net/sched')
27 files changed, 15266 insertions, 0 deletions
diff --git a/release/src/linux/linux/net/sched/Config.in b/release/src/linux/linux/net/sched/Config.in new file mode 100644 index 00000000..8e203456 --- /dev/null +++ b/release/src/linux/linux/net/sched/Config.in @@ -0,0 +1,41 @@ +# +# Traffic control configuration. +# +tristate ' CBQ packet scheduler' CONFIG_NET_SCH_CBQ +tristate ' HTB packet scheduler' CONFIG_NET_SCH_HTB +tristate ' CSZ packet scheduler' CONFIG_NET_SCH_CSZ +#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ +#tristate ' H-FSC packet scheduler' CONFIG_NET_SCH_HFCS +if [ "$CONFIG_ATM" = "y" ]; then + bool ' ATM pseudo-scheduler' CONFIG_NET_SCH_ATM +fi +tristate ' The simplest PRIO pseudoscheduler' CONFIG_NET_SCH_PRIO +tristate ' RED queue' CONFIG_NET_SCH_RED +tristate ' SFQ queue' CONFIG_NET_SCH_SFQ +tristate ' TEQL queue' CONFIG_NET_SCH_TEQL +tristate ' TBF queue' CONFIG_NET_SCH_TBF +tristate ' GRED queue' CONFIG_NET_SCH_GRED +tristate ' Diffserv field marker' CONFIG_NET_SCH_DSMARK +if [ "$CONFIG_NETFILTER" = "y" ]; then + tristate ' Ingress Qdisc' CONFIG_NET_SCH_INGRESS +fi +bool ' QoS support' CONFIG_NET_QOS +if [ "$CONFIG_NET_QOS" = "y" ]; then + bool ' Rate estimator' CONFIG_NET_ESTIMATOR +fi +bool ' Packet classifier API' CONFIG_NET_CLS +if [ "$CONFIG_NET_CLS" = "y" ]; then + tristate ' TC index classifier' CONFIG_NET_CLS_TCINDEX + tristate ' Routing table based classifier' CONFIG_NET_CLS_ROUTE4 + if [ "$CONFIG_NET_CLS_ROUTE4" != "n" ]; then + define_bool CONFIG_NET_CLS_ROUTE y + fi + tristate ' Firewall based classifier' CONFIG_NET_CLS_FW + tristate ' U32 classifier' CONFIG_NET_CLS_U32 + if [ "$CONFIG_NET_QOS" = "y" ]; then + tristate ' Special RSVP classifier' CONFIG_NET_CLS_RSVP + tristate ' Special RSVP classifier for IPv6' CONFIG_NET_CLS_RSVP6 + bool ' Traffic policing (needed for in/egress)' CONFIG_NET_CLS_POLICE + fi +fi + diff --git a/release/src/linux/linux/net/sched/Makefile b/release/src/linux/linux/net/sched/Makefile new file mode 100644 index 00000000..e48e5c3e --- /dev/null 
+++ b/release/src/linux/linux/net/sched/Makefile @@ -0,0 +1,35 @@ +# +# Makefile for the Linux Traffic Control Unit. +# + +O_TARGET := sched.o + +obj-y := sch_generic.o + + +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_ESTIMATOR) += estimator.o +obj-$(CONFIG_NET_CLS) += cls_api.o +obj-$(CONFIG_NET_CLS_POLICE) += police.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o +obj-$(CONFIG_NET_SCH_CSZ) += sch_csz.o +obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o +obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o +obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o +obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o +obj-$(CONFIG_NET_SCH_RED) += sch_red.o +obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o +obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o +obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o +obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o +obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o +obj-$(CONFIG_NET_CLS_U32) += cls_u32.o +obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o +obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o +obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o +obj-$(CONFIG_NET_CLS_FW) += cls_fw.o + +include $(TOPDIR)/Rules.make diff --git a/release/src/linux/linux/net/sched/cls_api.c b/release/src/linux/linux/net/sched/cls_api.c new file mode 100644 index 00000000..9d9b0d65 --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_api.c @@ -0,0 +1,470 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * + * Eduardo J. 
Blanco <ejbs@netlabs.com.uy> :990222: kmod support + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base; + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static rwlock_t cls_mod_lock = RW_LOCK_UNLOCKED; + +/* Find classifier type by string name */ + +struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +{ + struct tcf_proto_ops *t = NULL; + + if (kind) { + read_lock(&cls_mod_lock); + for (t = tcf_proto_base; t; t = t->next) { + if (rtattr_strcmp(kind, t->kind) == 0) + break; + } + read_unlock(&cls_mod_lock); + } + return t; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) { + if (strcmp(ops->kind, t->kind) == 0) { + write_unlock(&cls_mod_lock); + return -EEXIST; + } + } + + ops->next = NULL; + *tp = ops; + write_unlock(&cls_mod_lock); + return 0; +} + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) { + write_unlock(&cls_mod_lock); + return -ENOENT; + } + *tp = t->next; + write_unlock(&cls_mod_lock); + return 0; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr 
*n, + struct tcf_proto *tp, unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. */ + +static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +{ + u32 first = TC_H_MAKE(0xC0000000U,0U); + + if (tp) + first = tp->prio-1; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca = arg; + struct tcmsg *t = NLMSG_DATA(n); + u32 protocol = TC_H_MIN(t->tcm_info); + u32 prio = TC_H_MAJ(t->tcm_info); + u32 nprio = prio; + u32 parent = t->tcm_parent; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp = NULL; + struct tcf_proto_ops *tp_ops; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long fh; + int err; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + prio = TC_H_MAKE(0x80000000U,0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) + return -ENODEV; + + /* Find qdisc */ + if (!parent) { + q = dev->qdisc_sleeping; + parent = q->handle; + } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) + return -EINVAL; + + /* Is it classful? */ + if ((cops = q->ops->cl_ops) == NULL) + return -EINVAL; + + /* Do we search for filter, attached to class? 
*/ + if (TC_H_MIN(parent)) { + cl = cops->get(q, parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp=*back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND-1] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + goto errout; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); +#ifdef CONFIG_KMOD + if (tp_ops==NULL && tca[TCA_KIND-1] != NULL) { + struct rtattr *kind = tca[TCA_KIND-1]; + char module_name[4 + IFNAMSIZ + 1]; + + if (RTA_PAYLOAD(kind) <= IFNAMSIZ) { + sprintf(module_name, "cls_%s", (char*)RTA_DATA(kind)); + request_module (module_name); + tp_ops = tcf_proto_lookup_ops(kind); + } + } +#endif + if (tp_ops == NULL) { + err = -EINVAL; + kfree(tp); + goto errout; + } + memset(tp, 0, sizeof(*tp)); + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? 
: tcf_auto_prio(*back); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = parent; + err = tp_ops->init(tp); + if (err) { + kfree(tp); + goto errout; + } + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); + tp->next = *back; + *back = tp; + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); + } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); + *back = tp->next; + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); + + tp->ops->destroy(tp); + kfree(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto errout; + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + +errout: + if (cl) + cops->put(q, cl); + return err; +} + +static int +tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_handle = 0; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + if 
(tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct tcf_dump_args +{ + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void*)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return skb->len; + + read_lock(&qdisc_tree_lock); + if (!tcm->tcm_parent) + q = dev->qdisc_sleeping; + else + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + if (q == NULL) { + read_unlock(&qdisc_tree_lock); + dev_put(dev); + return skb->len; + } + if ((cops = q->ops->cl_ops) == NULL) + goto errout; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = 
cb->args[0]; + + for (tp=*chain, t=0; tp; tp = tp->next, t++) { + if (t < s_t) continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + break; + } + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]-1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count+1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: + if (cl) + cops->put(q, cl); + + read_unlock(&qdisc_tree_lock); + dev_put(dev); + return skb->len; +} + + +int __init tc_filter_init(void) +{ + struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. 
+ */ + + if (link_p) { + link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; + } +#define INIT_TC_FILTER(name) { \ + extern struct tcf_proto_ops cls_##name##_ops; \ + register_tcf_proto_ops(&cls_##name##_ops); \ + } + +#ifdef CONFIG_NET_CLS_U32 + INIT_TC_FILTER(u32); +#endif +#ifdef CONFIG_NET_CLS_ROUTE4 + INIT_TC_FILTER(route4); +#endif +#ifdef CONFIG_NET_CLS_FW + INIT_TC_FILTER(fw); +#endif +#ifdef CONFIG_NET_CLS_RSVP + INIT_TC_FILTER(rsvp); +#endif +#ifdef CONFIG_NET_CLS_TCINDEX + INIT_TC_FILTER(tcindex); +#endif +#ifdef CONFIG_NET_CLS_RSVP6 + INIT_TC_FILTER(rsvp6); +#endif + return 0; +} diff --git a/release/src/linux/linux/net/sched/cls_fw.c b/release/src/linux/linux/net/sched/cls_fw.c new file mode 100644 index 00000000..15c5df7d --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_fw.c @@ -0,0 +1,379 @@ +/* + * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one + * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). 
+ */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/netfilter.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +struct fw_head +{ + struct fw_filter *ht[256]; +}; + +struct fw_filter +{ + struct fw_filter *next; + u32 id; + struct tcf_result res; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif +}; + +static __inline__ int fw_hash(u32 handle) +{ + return handle&0xFF; +} + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; +#ifdef CONFIG_NETFILTER + u32 id = skb->nfmark; +#else + u32 id = 0; +#endif + + if (head == NULL) + goto old_method; + + for (f=head->ht[fw_hash(id)]; f; f=f->next) { + if (f->id == id) { + *res = f->res; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) + return tcf_police(skb, f->police); +#endif + return 0; + } + } + return -1; + +old_method: + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + + if (head == NULL) + return 0; + + for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + if (f->id == handle) + return (unsigned long)f; + } + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long 
f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static void fw_destroy(struct tcf_proto *tp) +{ + struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); + struct fw_filter *f; + int h; + + if (head == NULL) { + MOD_DEC_USE_COUNT; + return; + } + + for (h=0; h<256; h++) { + while ((f=head->ht[h]) != NULL) { + unsigned long cl; + head->ht[h] = f->next; + + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + } + } + kfree(head); + MOD_DEC_USE_COUNT; +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f = (struct fw_filter*)arg; + struct fw_filter **fp; + + if (head == NULL || f == NULL) + return -EINVAL; + + for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + unsigned long cl; + + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + return 0; + } + } + return -EINVAL; +} + +static int fw_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_FW_MAX]; + int err; + + if (!opt) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse(tb, TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((f = (struct fw_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->id != handle && handle) + return -EINVAL; + if (tb[TCA_FW_CLASSID-1]) { + unsigned long cl; + + f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid); + cl = cls_set_class(tp, &f->res.class, cl); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_FW_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]); + + tcf_tree_lock(tp); + police = xchg(&f->police, police); + tcf_tree_unlock(tp); + + tcf_police_release(police); + } +#endif + return 0; + } + + if (!handle) + return -EINVAL; + + if (head == NULL) { + head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + memset(head, 0, sizeof(*head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + memset(f, 0, sizeof(*f)); + + f->id = handle; + + if (tb[TCA_FW_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + } + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_FW_POLICE-1]) + f->police = tcf_police_locate(tb[TCA_FW_POLICE-1], tca[TCA_RATE-1]); +#endif + + f->next = head->ht[fw_hash(handle)]; + tcf_tree_lock(tp); + head->ht[fw_hash(handle)] = f; + tcf_tree_unlock(tp); + + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + int h; + + if (head == NULL) + arg->stop = 1; + 
+ if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct fw_filter *f; + + for (f = head->ht[h]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static int fw_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct fw_filter *f = (struct fw_filter*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->id; + + if (!f->res.classid +#ifdef CONFIG_NET_CLS_POLICE + && !f->police +#endif + ) + return skb->len; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (f->res.classid) + RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_FW_POLICE, 0, NULL); + + if (tcf_police_dump(skb, f->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + + rta->rta_len = skb->tail - b; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; + } +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct tcf_proto_ops cls_fw_ops = { + NULL, + "fw", + fw_classify, + fw_init, + fw_destroy, + + fw_get, + fw_put, + fw_change, + fw_delete, + fw_walk, + fw_dump +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_fw_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_fw_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/cls_route.c b/release/src/linux/linux/net/sched/cls_route.c new file mode 100644 index 00000000..eb348c91 --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_route.c @@ -0,0 +1,628 @@ +/* + * net/sched/cls_route.c ROUTE4 classifier. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +struct route4_fastmap +{ + struct route4_filter *filter; + u32 id; + int iif; +}; + +struct route4_head +{ + struct route4_fastmap fastmap[16]; + struct route4_bucket *table[256+1]; +}; + +struct route4_bucket +{ + struct route4_filter *ht[16+16+1]; +}; + +struct route4_filter +{ + struct route4_filter *next; + u32 id; + int iif; + + struct tcf_result res; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + + u32 handle; + struct route4_bucket *bkt; +}; + +#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) + +static __inline__ int route4_fastmap_hash(u32 id, int iif) +{ + return id&0xF; +} + +static void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) +{ + spin_lock_bh(&dev->queue_lock); + memset(head->fastmap, 0, sizeof(head->fastmap)); + spin_unlock_bh(&dev->queue_lock); +} + +static void __inline__ +route4_set_fastmap(struct route4_head *head, u32 id, int iif, + struct route4_filter *f) +{ + int h = route4_fastmap_hash(id, iif); + 
head->fastmap[h].id = id; + head->fastmap[h].iif = iif; + head->fastmap[h].filter = f; +} + +static __inline__ int route4_hash_to(u32 id) +{ + return id&0xFF; +} + +static __inline__ int route4_hash_from(u32 id) +{ + return (id>>16)&0xF; +} + +static __inline__ int route4_hash_iif(int iif) +{ + return 16 + ((iif>>16)&0xF); +} + +static __inline__ int route4_hash_wild(void) +{ + return 32; +} + +#ifdef CONFIG_NET_CLS_POLICE +#define IF_ROUTE_POLICE \ +if (f->police) { \ + int pol_res = tcf_police(skb, f->police); \ + if (pol_res >= 0) return pol_res; \ + dont_cache = 1; \ + continue; \ +} \ +if (!dont_cache) +#else +#define IF_ROUTE_POLICE +#endif + + +static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct dst_entry *dst; + struct route4_bucket *b; + struct route4_filter *f; +#ifdef CONFIG_NET_CLS_POLICE + int dont_cache = 0; +#endif + u32 id, h; + int iif; + + if ((dst = skb->dst) == NULL) + goto failure; + + id = dst->tclassid; + if (head == NULL) + goto old_method; + + iif = ((struct rtable*)dst)->key.iif; + + h = route4_fastmap_hash(id, iif); + if (id == head->fastmap[h].id && + iif == head->fastmap[h].iif && + (f = head->fastmap[h].filter) != NULL) { + if (f == ROUTE4_FAILURE) + goto failure; + + *res = f->res; + return 0; + } + + h = route4_hash_to(id); + +restart: + if ((b = head->table[h]) != NULL) { + f = b->ht[route4_hash_from(id)]; + + for ( ; f; f = f->next) { + if (f->id == id) { + *res = f->res; + IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); + return 0; + } + } + + for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) { + if (f->iif == iif) { + *res = f->res; + IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); + return 0; + } + } + + for (f = b->ht[route4_hash_wild()]; f; f = f->next) { + *res = f->res; + IF_ROUTE_POLICE route4_set_fastmap(head, id, iif, f); + return 0; + } + + } + if (h < 256) { + h = 256; + id &= ~0xFFFF; + 
goto restart; + } + +#ifdef CONFIG_NET_CLS_POLICE + if (!dont_cache) +#endif + route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); +failure: + return -1; + +old_method: + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + return -1; +} + +static u32 to_hash(u32 id) +{ + u32 h = id&0xFF; + if (id&0x8000) + h += 256; + return h; +} + +static u32 from_hash(u32 id) +{ + id &= 0xFFFF; + if (id == 0xFFFF) + return 32; + if (!(id & 0x8000)) { + if (id > 255) + return 256; + return id&0xF; + } + return 16 + (id&0xF); +} + +static unsigned long route4_get(struct tcf_proto *tp, u32 handle) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct route4_bucket *b; + struct route4_filter *f; + unsigned h1, h2; + + if (!head) + return 0; + + h1 = to_hash(handle); + if (h1 > 256) + return 0; + + h2 = from_hash(handle>>16); + if (h2 > 32) + return 0; + + if ((b = head->table[h1]) != NULL) { + for (f = b->ht[h2]; f; f = f->next) + if (f->handle == handle) + return (unsigned long)f; + } + return 0; +} + +static void route4_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route4_init(struct tcf_proto *tp) +{ + MOD_INC_USE_COUNT; + return 0; +} + +static void route4_destroy(struct tcf_proto *tp) +{ + struct route4_head *head = xchg(&tp->root, NULL); + int h1, h2; + + if (head == NULL) { + MOD_DEC_USE_COUNT; + return; + } + + for (h1=0; h1<=256; h1++) { + struct route4_bucket *b; + + if ((b = head->table[h1]) != NULL) { + for (h2=0; h2<=32; h2++) { + struct route4_filter *f; + + while ((f = b->ht[h2]) != NULL) { + unsigned long cl; + + b->ht[h2] = f->next; + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + } + } + kfree(b); + } + } + kfree(head); + MOD_DEC_USE_COUNT; +} + +static int route4_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct 
route4_head *head = (struct route4_head*)tp->root; + struct route4_filter **fp, *f = (struct route4_filter*)arg; + unsigned h = 0; + struct route4_bucket *b; + int i; + + if (!head || !f) + return -EINVAL; + + h = f->handle; + b = f->bkt; + + for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + unsigned long cl; + + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + + /* Strip tree */ + + for (i=0; i<=32; i++) + if (b->ht[i]) + return 0; + + /* OK, session has no flows */ + tcf_tree_lock(tp); + head->table[to_hash(h)] = NULL; + tcf_tree_unlock(tp); + + kfree(b); + return 0; + } + } + return 0; +} + +static int route4_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct route4_head *head = tp->root; + struct route4_filter *f, *f1, **ins_f; + struct route4_bucket *b; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ROUTE4_MAX]; + unsigned h1, h2; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((f = (struct route4_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + return -EINVAL; + if (tb[TCA_ROUTE4_CLASSID-1]) { + unsigned long cl; + + f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_ROUTE4_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); + + tcf_tree_lock(tp); + police = xchg(&f->police, police); + tcf_tree_unlock(tp); + + tcf_police_release(police); + } +#endif + return 0; + } + + /* Now more serious part... */ + + if (head == NULL) { + head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + memset(head, 0, sizeof(struct route4_head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + + memset(f, 0, sizeof(*f)); + + err = -EINVAL; + f->handle = 0x8000; + if (tb[TCA_ROUTE4_TO-1]) { + if (handle&0x8000) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < 4) + goto errout; + f->id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + if (f->id > 0xFF) + goto errout; + f->handle = f->id; + } + if (tb[TCA_ROUTE4_FROM-1]) { + u32 sid; + if (tb[TCA_ROUTE4_IIF-1]) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < 4) + goto errout; + sid = (*(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1])); + if (sid > 0xFF) + goto errout; + f->handle |= sid<<16; + f->id |= sid<<16; + } else if (tb[TCA_ROUTE4_IIF-1]) { + if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < 4) + goto errout; + f->iif = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + if (f->iif > 0x7FFF) + goto errout; + f->handle |= (f->iif|0x8000)<<16; 
+ } else + f->handle |= 0xFFFF<<16; + + if (handle) { + f->handle |= handle&0x7F00; + if (f->handle != handle) + goto errout; + } + + if (tb[TCA_ROUTE4_CLASSID-1]) { + if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + } + + h1 = to_hash(f->handle); + if ((b = head->table[h1]) == NULL) { + err = -ENOBUFS; + b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); + if (b == NULL) + goto errout; + memset(b, 0, sizeof(*b)); + + tcf_tree_lock(tp); + head->table[h1] = b; + tcf_tree_unlock(tp); + } + f->bkt = b; + + err = -EEXIST; + h2 = from_hash(f->handle>>16); + for (ins_f = &b->ht[h2]; (f1=*ins_f) != NULL; ins_f = &f1->next) { + if (f->handle < f1->handle) + break; + if (f1->handle == f->handle) + goto errout; + } + + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_ROUTE4_POLICE-1]) + f->police = tcf_police_locate(tb[TCA_ROUTE4_POLICE-1], tca[TCA_RATE-1]); +#endif + + f->next = f1; + tcf_tree_lock(tp); + *ins_f = f; + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct route4_head *head = tp->root; + unsigned h, h1; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h <= 256; h++) { + struct route4_bucket *b = head->table[h]; + + if (b) { + for (h1 = 0; h1 <= 32; h1++) { + struct route4_filter *f; + + for (f = b->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } + } + } +} + +static int route4_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct route4_filter *f = (struct route4_filter*)fh; + unsigned 
char *b = skb->tail; + struct rtattr *rta; + u32 id; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (!(f->handle&0x8000)) { + id = f->id&0xFF; + RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + } + if (f->handle&0x80000000) { + if ((f->handle>>16) != 0xFFFF) + RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + } else { + id = f->id>>16; + RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); + } + if (f->res.classid) + RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_ROUTE4_POLICE, 0, NULL); + + if (tcf_police_dump(skb, f->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + + rta->rta_len = skb->tail - b; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; + } +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct tcf_proto_ops cls_route4_ops = { + NULL, + "route", + route4_classify, + route4_init, + route4_destroy, + + route4_get, + route4_put, + route4_change, + route4_delete, + route4_walk, + route4_dump +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_route4_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_route4_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/cls_rsvp.c b/release/src/linux/linux/net/sched/cls_rsvp.c new file mode 100644 index 00000000..05a937b5 --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_rsvp.c @@ -0,0 +1,42 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/cls_rsvp.h b/release/src/linux/linux/net/sched/cls_rsvp.h new file mode 100644 index 00000000..070d25f6 --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_rsvp.h @@ -0,0 +1,700 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +/* + Compared to the general packet classification problem, + RSVP needs only several relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. 
+ * src may be exact, or may be wildcard, so that + we can keep a hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use a two level hash table: The top level is keyed by + destination address and protocol ID, every bucket contains a list + of "rsvp sessions", identified by destination address, protocol and + DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has a smaller hash table keyed by source address + (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again a list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. All the packets with IPv6 extension headers (except AH and ESP) + and all fragmented packets go to the best-effort traffic class. + + + NOTE 2. Two "port id"'s seem to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + At first sight, this redundancy is just a waste of CPU + resources. But DPI and SPI add the possibility to assign different + priorities to GPIs. Look also at note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as follows: if the first lookup + matches a special session with "tunnelhdr" value not zero, + flowid doesn't contain the true flow ID, but the tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to the list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. Two GPIs make it possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 
8)8) + + + Well, as result, despite its simplicity, we get a pretty + powerful classification engine. */ + +#include <linux/config.h> + +struct rsvp_head +{ + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session +{ + struct rsvp_session *next; + u32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16+1]; +}; + + +struct rsvp_filter +{ + struct rsvp_filter *next; + u32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + + u32 handle; + struct rsvp_session *sess; +}; + +static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned h = dst[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static __inline__ unsigned hash_src(u32 *src) +{ + unsigned h = src[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +#ifdef CONFIG_NET_CLS_POLICE +#define RSVP_POLICE() \ +if (f->police) { \ + int pol_res = tcf_police(skb, f->police); \ + if (pol_res < 0) continue; \ + if (pol_res) return pol_res; \ +} +#else +#define RSVP_POLICE() +#endif + + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1, h2; + u32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr = skb->nh.ipv6h; +#else + struct iphdr *nhptr = skb->nh.iph; +#endif + +#if !defined(__i386__) && !defined(__mc68000__) + if ((unsigned long)nhptr & 3) + return -1; +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8*)nhptr) + 
sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; + protocol = nhptr->protocol; + xprt = ((u8*)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + protocol == s->protocol && + !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && + !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +#if RSVP_DST_LEN == 4 + && src[0] == f->src[0] + && src[1] == f->src[1] + && src[2] == f->src[2] +#endif + ) { + *res = f->res; + + RSVP_POLICE(); + +matched: + if (f->tunnelhdr == 0) + return 0; + + tunnelid = f->res.classid; + nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + + /* And wildcard bucket... 
*/ + for (f = s->ht[16]; f; f = f->next) { + *res = f->res; + RSVP_POLICE(); + goto matched; + } + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1 = handle&0xFF; + unsigned h2 = (handle>>8)&0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + MOD_INC_USE_COUNT; + data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + memset(data, 0, sizeof(struct rsvp_head)); + tp->root = data; + return 0; + } + MOD_DEC_USE_COUNT; + return -ENOBUFS; +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + sht[h1] = s->next; + + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + unsigned long cl; + + s->ht[h2] = f->next; + if ((cl = __cls_set_class(&f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + kfree(f); + } + } + kfree(s); + } + } + kfree(data); + MOD_DEC_USE_COUNT; +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; + unsigned h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + unsigned long cl; + + + tcf_tree_lock(tp); + *fp = f->next; + 
tcf_tree_unlock(tp); + + if ((cl = cls_set_class(tp, &f->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(f->police); +#endif + + kfree(f); + + /* Strip tree */ + + for (i=0; i<=16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + tcf_tree_lock(tp); + *sp = s->next; + tcf_tree_unlock(tp); + + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + if ((data->hgenerator += 0x10000) == 0) + data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator>>5; + u32 b = 1<<(data->tgenerator&0x1F); + + if (data->tmap[n]&b) + return 0; + data->tmap[n] |= b; + return 1; +} + +/* + * Rebuild data->tmap so that only tunnel ids still referenced by a filter + * (res.classid of every filter with a non-zero tunnelhdr) remain marked. + * The map is cleared first and then refilled through tunnel_bts(); as a + * side effect data->tgenerator is left at the last id that was marked. + */ +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + int h1, h2; + + memset(data->tmap, 0, sizeof(data->tmap)); + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k=0; k<2; k++) { + for (i=255; i>0; i--) { + if (++data->tgenerator == 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static int rsvp_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct rsvp_head 
*data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_RSVP_MAX]; + unsigned h1, h2; + u32 *dst; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((f = (struct rsvp_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + return -EINVAL; + if (tb[TCA_RSVP_CLASSID-1]) { + unsigned long cl; + + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + cl = cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); + + tcf_tree_lock(tp); + police = xchg(&f->police, police); + tcf_tree_unlock(tp); + + tcf_police_release(police); + } +#endif + return 0; + } + + /* Now more serious part... 
*/ + if (handle) + return -EINVAL; + if (tb[TCA_RSVP_DST-1] == NULL) + return -EINVAL; + + f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + + memset(f, 0, sizeof(*f)); + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) + goto errout; + memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) + goto errout; + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + } + + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) + goto errout; + dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? 
pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + /* pinfo is NULL when TCA_RSVP_PINFO was not supplied; such a request never matches an existing session */ + for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo && pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + cls_set_class(tp, &f->res.class, tp->q->ops->cl_ops->bind_tcf(tp->q, base, f->res.classid)); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_RSVP_POLICE-1]) + f->police = tcf_police_locate(tb[TCA_RSVP_POLICE-1], tca[TCA_RATE-1]); +#endif + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + wmb(); + *fp = f; + + *arg = (unsigned long)f; + return 0; + } + } + + /* No session found. Create new one. 
*/ + + err = -ENOBUFS; + s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memset(s, 0, sizeof(*s)); + memcpy(s->dst, dst, sizeof(s->dst)); + /* without TCA_RSVP_PINFO the session keeps the zeroed dpi/protocol/tunnelid */ + if (pinfo) { + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + } + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + wmb(); + *sp = s; + + goto insert; + +errout: + if (f) + kfree(f); + return err; +} + +/* + * Visit every filter of every session in tp->root: the first arg->skip + * filters are only counted, arg->fn is called on each later one. The walk + * sets arg->stop and ends immediately when arg->fn returns a negative value. + */ +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + /* leave all loops, not just the innermost one */ + return; + } + arg->count++; + } + } + } + } +} + +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_session *s; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + if (((f->handle>>8)&0xFF) != 16) + RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, 
TCA_RSVP_POLICE, 0, NULL); + + if (tcf_police_dump(skb, f->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + + rta->rta_len = skb->tail - b; +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + if (qdisc_copy_stats(skb, &f->police->stats)) + goto rtattr_failure; + } +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct tcf_proto_ops RSVP_OPS = { + NULL, + RSVP_ID, + rsvp_classify, + rsvp_init, + rsvp_destroy, + + rsvp_get, + rsvp_put, + rsvp_change, + rsvp_delete, + rsvp_walk, + rsvp_dump +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} +#endif diff --git a/release/src/linux/linux/net/sched/cls_rsvp6.c b/release/src/linux/linux/net/sched/cls_rsvp6.c new file mode 100644 index 00000000..85ed7b40 --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_rsvp6.c @@ -0,0 +1,43 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/cls_tcindex.c b/release/src/linux/linux/net/sched/cls_tcindex.c new file mode 100644 index 00000000..f0a6ffdc --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_tcindex.c @@ -0,0 +1,496 @@ +/* + * net/sched/cls_tcindex.c Packet classifier for skb->tc_index + * + * Written 1998,1999 by Werner Almesberger, EPFL ICA + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/pkt_sched.h> +#include <net/route.h> + + +/* + * Not quite sure if we need all the xchgs Alexey uses when accessing things. + * Can always add them later ... :) + */ + + +#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ +#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ + + +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) + +#define D2PRINTK(format,args...) 
+ + +#define PRIV(tp) ((struct tcindex_data *) (tp)->root) + + +struct tcindex_filter_result { + struct tcf_police *police; + struct tcf_result res; +}; + +struct tcindex_filter { + __u16 key; + struct tcindex_filter_result result; + struct tcindex_filter *next; +}; + + +struct tcindex_data { + struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ + struct tcindex_filter **h; /* imperfect hash; only used if !perfect; + NULL if unused */ + __u16 mask; /* AND key with mask */ + int shift; /* shift ANDed key to the right */ + int hash; /* hash table size; 0 if undefined */ + int alloc_hash; /* allocated size */ + int fall_through; /* 0: only classify if explicit match */ +}; + + +static struct tcindex_filter_result *lookup(struct tcindex_data *p,__u16 key) +{ + struct tcindex_filter *f; + + if (p->perfect) + return p->perfect[key].res.class ? p->perfect+key : NULL; + if (!p->h) + return NULL; + for (f = p->h[key % p->hash]; f; f = f->next) { + if (f->key == key) + return &f->result; + } + return NULL; +} + + +static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *f; + + D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + + f = lookup(p,(skb->tc_index & p->mask) >> p->shift); + if (!f) { + if (!p->fall_through) + return -1; + res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), + (skb->tc_index& p->mask) >> p->shift); + res->class = 0; + D2PRINTK("alg 0x%x\n",res->classid); + return 0; + } + *res = f->res; + D2PRINTK("map 0x%x\n",res->classid); +#ifdef CONFIG_NET_CLS_POLICE + if (f->police) { + int result; + + result = tcf_police(skb,f->police); + D2PRINTK("police %d\n",res); + return result; + } +#endif + return 0; +} + + +static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r; + + DPRINTK("tcindex_get(tp %p,handle 
0x%08x)\n",tp,handle); + if (p->perfect && handle >= p->alloc_hash) + return 0; + r = lookup(PRIV(tp),handle); + return r && r->res.class ? (unsigned long) r : 0; +} + + +static void tcindex_put(struct tcf_proto *tp, unsigned long f) +{ + DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); +} + + +static int tcindex_init(struct tcf_proto *tp) +{ + struct tcindex_data *p; + + DPRINTK("tcindex_init(tp %p)\n",tp); + MOD_INC_USE_COUNT; + p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); + if (!p) { + MOD_DEC_USE_COUNT; + return -ENOMEM; + } + tp->root = p; + p->perfect = NULL; + p->h = NULL; + p->hash = 0; + p->mask = 0xffff; + p->shift = 0; + p->fall_through = 1; + return 0; +} + + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter *f = NULL; + unsigned long cl; + + DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + if (p->perfect) { + if (!r->res.class) + return -ENOENT; + } else { + int i; + struct tcindex_filter **walk = NULL; + + for (i = 0; i < p->hash; i++) + for (walk = p->h+i; *walk; walk = &(*walk)->next) + if (&(*walk)->result == r) + goto found; + return -ENOENT; + +found: + f = *walk; + tcf_tree_lock(tp); + *walk = f->next; + tcf_tree_unlock(tp); + } + cl = __cls_set_class(&r->res.class,0); + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q,cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(r->police); +#endif + if (f) + kfree(f); + return 0; +} + + +/* + * There are no parameters for tcindex_init, so we overload tcindex_change + */ + + +static int tcindex_change(struct tcf_proto *tp,unsigned long base,u32 handle, + struct rtattr **tca,unsigned long *arg) +{ + struct tcindex_filter_result new_filter_result = { + NULL, /* no policing */ + { 0,0 }, /* no classification */ + }; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_TCINDEX_MAX]; + struct tcindex_data *p = 
PRIV(tp); + struct tcindex_filter *f; + struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + struct tcindex_filter **walk; + int hash,shift; + __u16 mask; + + DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + "p %p,r %p\n",tp,handle,tca,arg,opt,p,r); + if (arg) + DPRINTK("*arg = 0x%lx\n",*arg); + if (!opt) + return 0; + if (rtattr_parse(tb,TCA_TCINDEX_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + if (!tb[TCA_TCINDEX_HASH-1]) { + hash = p->hash; + } else { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(int)) + return -EINVAL; + hash = *(int *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); + } + if (!tb[TCA_TCINDEX_MASK-1]) { + mask = p->mask; + } else { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(__u16)) + return -EINVAL; + mask = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); + } + if (!tb[TCA_TCINDEX_SHIFT-1]) + shift = p->shift; + else { + /* shift is read as an int below, so require an int-sized payload */ + if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(int)) + return -EINVAL; + shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); + } + if (p->perfect && hash <= (mask >> shift)) + return -EBUSY; + if (p->perfect && hash > p->alloc_hash) + return -EBUSY; + if (p->h && hash != p->alloc_hash) + return -EBUSY; + p->hash = hash; + p->mask = mask; + p->shift = shift; + if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(int)) + return -EINVAL; + p->fall_through = + *(int *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); + } + DPRINTK("classid/police %p/%p\n",tb[TCA_TCINDEX_CLASSID-1], + tb[TCA_TCINDEX_POLICE-1]); + if (!tb[TCA_TCINDEX_CLASSID-1] && !tb[TCA_TCINDEX_POLICE-1]) + return 0; + if (!hash) { + if ((mask >> shift) < PERFECT_HASH_THRESHOLD) { + p->hash = (mask >> shift)+1; + } else { + p->hash = DEFAULT_HASH_SIZE; + } + } + if (!p->perfect && !p->h) { + p->alloc_hash = p->hash; + DPRINTK("hash %d mask %d\n",p->hash,p->mask); + if (p->hash > (mask >> shift)) { + p->perfect = kmalloc(p->hash* + sizeof(struct 
tcindex_filter_result),GFP_KERNEL); + if (!p->perfect) + return -ENOMEM; + memset(p->perfect, 0, + p->hash * sizeof(struct tcindex_filter_result)); + } else { + p->h = kmalloc(p->hash*sizeof(struct tcindex_filter *), + GFP_KERNEL); + if (!p->h) + return -ENOMEM; + memset(p->h, 0, p->hash*sizeof(struct tcindex_filter *)); + } + } + /* + * Note: this could be as restrictive as + * if (handle & ~(mask >> shift)) + * but then, we'd fail handles that may become valid after some + * future mask change. While this is extremely unlikely to ever + * matter, the check below is safer (and also more + * backwards-compatible). + */ + if (p->perfect && handle >= p->alloc_hash) + return -EINVAL; + if (p->perfect) { + r = p->perfect+handle; + } else { + r = lookup(p,handle); + DPRINTK("r=%p\n",r); + if (!r) + r = &new_filter_result; + } + DPRINTK("r=%p\n",r); + if (tb[TCA_TCINDEX_CLASSID-1]) { + unsigned long cl = cls_set_class(tp,&r->res.class,0); + + if (cl) + tp->q->ops->cl_ops->unbind_tcf(tp->q,cl); + r->res.classid = *(__u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + r->res.class = tp->q->ops->cl_ops->bind_tcf(tp->q,base, + r->res.classid); + if (!r->res.class) { + r->res.classid = 0; + return -ENOENT; + } + } +#ifdef CONFIG_NET_CLS_POLICE + { + struct tcf_police *police; + + police = tb[TCA_TCINDEX_POLICE-1] ? 
+ tcf_police_locate(tb[TCA_TCINDEX_POLICE-1],NULL) : NULL; + tcf_tree_lock(tp); + police = xchg(&r->police,police); + tcf_tree_unlock(tp); + tcf_police_release(police); + } +#endif + if (r != &new_filter_result) + return 0; + f = kmalloc(sizeof(struct tcindex_filter),GFP_KERNEL); + if (!f) + return -ENOMEM; + f->key = handle; + f->result = new_filter_result; + f->next = NULL; + for (walk = p->h+(handle % p->hash); *walk; walk = &(*walk)->next) + /* nothing */; + wmb(); + *walk = f; + return 0; +} + + +static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter *f,*next; + int i; + + DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + if (p->perfect) { + for (i = 0; i < p->hash; i++) { + if (!p->perfect[i].res.class) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(tp, + (unsigned long) (p->perfect+i), walker) + < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } + if (!p->h) + return; + for (i = 0; i < p->hash; i++) { + for (f = p->h[i]; f; f = next) { + next = f->next; + if (walker->count >= walker->skip) { + if (walker->fn(tp,(unsigned long) &f->result, + walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } +} + + +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, struct tcf_walker *walker) +{ + return tcindex_delete(tp,arg); +} + + +static void tcindex_destroy(struct tcf_proto *tp) +{ + struct tcindex_data *p = PRIV(tp); + struct tcf_walker walker; + + DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + walker.count = 0; + walker.skip = 0; + walker.fn = &tcindex_destroy_element; + tcindex_walk(tp,&walker); + if (p->perfect) + kfree(p->perfect); + if (p->h) + kfree(p->h); + kfree(p); + tp->root = NULL; + MOD_DEC_USE_COUNT; +} + + +static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tcindex_data *p = PRIV(tp); + struct 
tcindex_filter_result *r = (struct tcindex_filter_result *) fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp,fh,skb,t,p,r,b); + DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + if (!fh) { + t->tcm_handle = ~0; /* whatever ... */ + RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); + RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); + RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); + RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), + &p->fall_through); + } else { + if (p->perfect) { + t->tcm_handle = r-p->perfect; + } else { + struct tcindex_filter *f; + int i; + + t->tcm_handle = 0; + for (i = 0; !t->tcm_handle && i < p->hash; i++) { + for (f = p->h[i]; !t->tcm_handle && f; + f = f->next) { + if (&f->result == r) + t->tcm_handle = f->key; + } + } + } + DPRINTK("handle = %d\n",t->tcm_handle); + if (r->res.class) + RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); +#ifdef CONFIG_NET_CLS_POLICE + if (r->police) { + struct rtattr *p_rta = (struct rtattr *) skb->tail; + + RTA_PUT(skb,TCA_TCINDEX_POLICE,0,NULL); + if (tcf_police_dump(skb,r->police) < 0) + goto rtattr_failure; + p_rta->rta_len = skb->tail-(u8 *) p_rta; + } +#endif + } + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct tcf_proto_ops cls_tcindex_ops = { + NULL, + "tcindex", + tcindex_classify, + tcindex_init, + tcindex_destroy, + + tcindex_get, + tcindex_put, + tcindex_change, + tcindex_delete, + tcindex_walk, + tcindex_dump +}; + + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_tcindex_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_tcindex_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/cls_u32.c b/release/src/linux/linux/net/sched/cls_u32.c new 
file mode 100644 index 00000000..3f53f1be --- /dev/null +++ b/release/src/linux/linux/net/sched/cls_u32.c @@ -0,0 +1,723 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier I managed to + * invent; it is not super-fast, but it is not slow (provided you + * program it correctly), and general enough. And its relative + * speed grows as the number of rules becomes larger. + * + * It seems that it represents the best middle point between + * speed and manageability both by human and by machine. + * + * It is especially useful for link sharing combined with QoS; + * pure RSVP doesn't need such a general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. 
+ */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/rtnetlink.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +struct tc_u_knode +{ + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; +#ifdef CONFIG_NET_CLS_POLICE + struct tcf_police *police; +#endif + struct tcf_result res; + struct tc_u_hnode *ht_down; + struct tc_u32_sel sel; +}; + +struct tc_u_hnode +{ + struct tc_u_hnode *next; + u32 handle; + struct tc_u_common *tp_c; + int refcnt; + unsigned divisor; + u32 hgenerator; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common +{ + struct tc_u_common *next; + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static struct tc_u_common *u32_list; + +static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel) +{ + unsigned h = key & sel->hmask; + + h ^= h>>16; + h ^= h>>8; + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + u8 *ptr; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + u8 *ptr = skb->nh.raw; + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; + int i; + +#if !defined(__i386__) && !defined(__mc68000__) + if ((unsigned long)ptr & 3) + return -1; +#endif + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + + 
for (i = n->sel.nkeys; i>0; i--, key++) { + if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + n = n->next; + goto next_knode; + } + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags&TC_U32_TERMINAL) { + *res = n->res; +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) { + int pol_res = tcf_police(skb, n->police); + if (pol_res >= 0) + return pol_res; + } else +#endif + return 0; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].ptr = ptr; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) + sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel); + + if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags&TC_U32_VAROFFSET) + off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 &= ~3; + } + if (n->sel.flags&TC_U32_EAT) { + ptr += off2; + off2 = 0; + } + + if (ptr < skb->tail) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht = n->ht_up; + ptr = stack[sdepth].ptr; + goto check_terminal; + } + return -1; + +deadloop: + if (net_ratelimit()) + printk("cls_u32: dead loop\n"); + return -1; +} + +static __inline__ struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if (ht->handle == handle) + break; + + return ht; +} + +static __inline__ struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned sel; + struct tc_u_knode *n; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + return 0; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + return n; + + return NULL; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c 
= tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + MOD_INC_USE_COUNT; + + for (tp_c = u32_list; tp_c; tp_c = tp_c->next) + if (tp_c->q == tp->q) + break; + + root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) { + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(root_ht, 0, sizeof(*root_ht)); + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c) : 0x80000000; + + if (tp_c == NULL) { + tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + MOD_DEC_USE_COUNT; + return -ENOBUFS; + } + memset(tp_c, 0, sizeof(*tp_c)); + tp_c->q = tp->q; + tp_c->next = u32_list; + u32_list = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + unsigned long cl; + + if ((cl = __cls_set_class(&n->res.class, 0)) != 0) + tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); +#ifdef CONFIG_NET_CLS_POLICE + tcf_police_release(n->police); +#endif + if (n->ht_down) + n->ht_down->refcnt--; + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + tcf_tree_lock(tp); + *kp = key->next; + tcf_tree_unlock(tp); + + u32_destroy_key(tp, key); + return 0; + } + } + } + BUG_TRAP(0); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned h; + + for (h=0; h<=ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + + u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + BUG_TRAP(!ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + BUG_TRAP(0); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + + BUG_TRAP(root_ht != NULL); + + if (root_ht && 
--root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + struct tc_u_hnode *ht; + struct tc_u_common **tp_cp; + + for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { + if (*tp_cp == tp_c) { + *tp_cp = tp_c->next; + break; + } + } + + for (ht=tp_c->hlist; ht; ht = ht->next) + u32_clear_hnode(tp, ht); + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + BUG_TRAP(ht->refcnt == 0); + + kfree(ht); + }; + + kfree(tp_c); + } + + MOD_DEC_USE_COUNT; + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode*)ht); + + if (tp->root == ht) + return -EINVAL; + + if (--ht->refcnt == 0) + u32_destroy_hnode(tp, ht); + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned i = 0x7FF; + + for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle|(i>0xFFF ? 
0xFFF : i); +} + +static int u32_set_parms(struct Qdisc *q, unsigned long base, + struct tc_u_hnode *ht, + struct tc_u_knode *n, struct rtattr **tb, + struct rtattr *est) +{ + if (tb[TCA_U32_LINK-1]) { + u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + struct tc_u_hnode *ht_down = NULL; + + if (TC_U32_KEY(handle)) + return -EINVAL; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + return -EINVAL; + ht_down->refcnt++; + } + + sch_tree_lock(q); + ht_down = xchg(&n->ht_down, ht_down); + sch_tree_unlock(q); + + if (ht_down) + ht_down->refcnt--; + } + if (tb[TCA_U32_CLASSID-1]) { + unsigned long cl; + + n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + sch_tree_lock(q); + cl = __cls_set_class(&n->res.class, q->ops->cl_ops->bind_tcf(q, base, n->res.classid)); + sch_tree_unlock(q); + if (cl) + q->ops->cl_ops->unbind_tcf(q, cl); + } +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_U32_POLICE-1]) { + struct tcf_police *police = tcf_police_locate(tb[TCA_U32_POLICE-1], est); + + sch_tree_lock(q); + police = xchg(&n->police, police); + sch_tree_unlock(q); + + tcf_police_release(police); + } +#endif + return 0; +} + +static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_U32_MAX]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0) + return -EINVAL; + + if ((n = (struct tc_u_knode*)*arg) != NULL) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp->q, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + } + + if (tb[TCA_U32_DIVISOR-1]) { + unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); + ht->tp_c = tp_c; + ht->refcnt = 0; + ht->divisor = divisor; + ht->handle = handle; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH-1]) { + htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (TC_U32_HTID(htid) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL-1] == 0 || + RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + return -EINVAL; + + s = RTA_DATA(tb[TCA_U32_SEL-1]); + n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); + memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; + err = u32_set_parms(tp->q, base, ht, n, tb, tca[TCA_RATE-1]); + if (err == 0) { + struct tc_u_knode **ins; + 
for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + break; + + n->next = *ins; + wmb(); + *ins = n; + + *arg = (unsigned long)n; + return 0; + } + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; + u32 divisor = ht->divisor+1; + RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + } else { + RTA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if (n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + } + if (n->res.classid) + RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + if (n->ht_down) + RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); +#ifdef CONFIG_NET_CLS_POLICE + if (n->police) { + struct rtattr * p_rta = (struct rtattr*)skb->tail; + + RTA_PUT(skb, TCA_U32_POLICE, 0, NULL); + + if (tcf_police_dump(skb, n->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - 
(u8*)p_rta; + } +#endif + } + + rta->rta_len = skb->tail - b; +#ifdef CONFIG_NET_CLS_POLICE + if (TC_U32_KEY(n->handle) && n->police) { + if (qdisc_copy_stats(skb, &n->police->stats)) + goto rtattr_failure; + } +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct tcf_proto_ops cls_u32_ops = { + NULL, + "u32", + u32_classify, + u32_init, + u32_destroy, + + u32_get, + u32_put, + u32_change, + u32_delete, + u32_walk, + u32_dump +}; + +#ifdef MODULE +int init_module(void) +{ + return register_tcf_proto_ops(&cls_u32_ops); +} + +void cleanup_module(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/estimator.c b/release/src/linux/linux/net/sched/estimator.c new file mode 100644 index 00000000..e70066f9 --- /dev/null +++ b/release/src/linux/linux/net/sched/estimator.c @@ -0,0 +1,197 @@ +/* + * net/sched/estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. 
+ If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to build an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<<interval) seconds and evaluate EWMA: + + avrate = avrate*(1-W) + rate*W + + where W is chosen as negative power of 2: W = 2^(-ewma_log) + + The resulting time constant is: + + T = A/(-ln(1-W)) + + + NOTES. + + * The stored value for avbps is scaled by 2^5, so that maximal + rate is ~1Gbit, avpps is scaled by 2^10. + + * Minimal interval is HZ/4=250msec (it is the greatest common divisor + for HZ=100 and HZ=1024 8)), maximal interval + is (HZ/4)*2^EST_MAX_INTERVAL = 8sec. Shorter intervals + are too expensive, longer ones can be implemented + at user level painlessly. + */ + +#if (HZ%4) != 0 +#error Bad HZ value. 
+#endif + +#define EST_MAX_INTERVAL 5 + +struct qdisc_estimator +{ + struct qdisc_estimator *next; + struct tc_stats *stats; + unsigned interval; + int ewma_log; + u64 last_bytes; + u32 last_packets; + u32 avpps; + u32 avbps; +}; + +struct qdisc_estimator_head +{ + struct timer_list timer; + struct qdisc_estimator *list; +}; + +static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1]; + +/* Estimator array lock */ +static rwlock_t est_lock = RW_LOCK_UNLOCKED; + +static void est_timer(unsigned long arg) +{ + int idx = (int)arg; + struct qdisc_estimator *e; + + read_lock(&est_lock); + for (e = elist[idx].list; e; e = e->next) { + struct tc_stats *st = e->stats; + u64 nbytes; + u32 npackets; + u32 rate; + + spin_lock(st->lock); + nbytes = st->bytes; + npackets = st->packets; + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + st->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->stats->pps = (e->avpps+0x1FF)>>10; + spin_unlock(st->lock); + } + + mod_timer(&elist[idx].timer, jiffies + ((HZ/4)<<idx)); + read_unlock(&est_lock); +} + +int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt) +{ + struct qdisc_estimator *est; + struct tc_estimator *parm = RTA_DATA(opt); + + if (RTA_PAYLOAD(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->stats = stats; + est->ewma_log = parm->ewma_log; + est->last_bytes = stats->bytes; + est->avbps = stats->bps<<5; + est->last_packets = stats->packets; + est->avpps = stats->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + 
elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ/4)<<est->interval); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + write_lock_bh(&est_lock); + elist[est->interval].list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void qdisc_kill_estimator(struct tc_stats *stats) +{ + int idx; + struct qdisc_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + + write_lock_bh(&est_lock); + *pest = est->next; + write_unlock_bh(&est_lock); + + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + diff --git a/release/src/linux/linux/net/sched/police.c b/release/src/linux/linux/net/sched/police.c new file mode 100644 index 00000000..78fb8c55 --- /dev/null +++ b/release/src/linux/linux/net/sched/police.c @@ -0,0 +1,251 @@ +/* + * net/sched/police.c Input police filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) +#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) + +static u32 idx_gen; +static struct tcf_police *tcf_police_ht[16]; +/* Policer hash table lock */ +static rwlock_t police_lock = RW_LOCK_UNLOCKED; + +/* Each policer is serialized by its individual spinlock */ + +static __inline__ unsigned tcf_police_hash(u32 index) +{ + return index&0xF; +} + +static __inline__ struct tcf_police * tcf_police_lookup(u32 index) +{ + struct tcf_police *p; + + read_lock(&police_lock); + for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { + if (p->index == index) + break; + } + read_unlock(&police_lock); + return p; +} + +static __inline__ u32 tcf_police_new_index(void) +{ + do { + if (++idx_gen == 0) + idx_gen = 1; + } while (tcf_police_lookup(idx_gen)); + + return idx_gen; +} + + +void tcf_police_destroy(struct tcf_police *p) +{ + unsigned h = tcf_police_hash(p->index); + struct tcf_police **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { + if (*p1p == p) { + write_lock_bh(&police_lock); + *p1p = p->next; + write_unlock_bh(&police_lock); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&p->stats); +#endif + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + if (p->P_tab) + qdisc_put_rtab(p->P_tab); + kfree(p); + return; + } + } + BUG_TRAP(0); +} + 
+struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) +{ + unsigned h; + struct tcf_police *p; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + + if (rtattr_parse(tb, TCA_POLICE_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta)) < 0) + return NULL; + + if (tb[TCA_POLICE_TBF-1] == NULL) + return NULL; + + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + p->refcnt++; + return p; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + memset(p, 0, sizeof(*p)); + p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats.lock = &p->lock; + if (parm->rate.rate) { + if ((p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1])) == NULL) + goto failure; + if (parm->peakrate.rate && + (p->P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE-1])) == NULL) + goto failure; + } + if (tb[TCA_POLICE_RESULT-1]) + p->result = *(int*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); +#ifdef CONFIG_NET_ESTIMATOR + if (tb[TCA_POLICE_AVRATE-1]) + p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); +#endif + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) { + p->mtu = ~0; + if (p->R_tab) + p->mtu = 255<<p->R_tab->rate.cell_log; + } + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? 
: tcf_police_new_index(); + p->action = parm->action; +#ifdef CONFIG_NET_ESTIMATOR + if (est) + qdisc_new_estimator(&p->stats, est); +#endif + h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); + return p; + +failure: + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + kfree(p); + return NULL; +} + +int tcf_police(struct sk_buff *skb, struct tcf_police *p) +{ + psched_time_t now; + long toks; + long ptoks = 0; + + spin_lock(&p->lock); + + p->stats.bytes += skb->len; + p->stats.packets++; + +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate && p->stats.bps >= p->ewma_rate) { + p->stats.overlimits++; + spin_unlock(&p->lock); + return p->action; + } +#endif + + if (skb->len <= p->mtu) { + if (p->R_tab == NULL) { + spin_unlock(&p->lock); + return p->result; + } + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst, 0); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + spin_unlock(&p->lock); + return p->result; + } + } + + p->stats.overlimits++; + spin_unlock(&p->lock); + return p->action; +} + +int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + if (p->R_tab) + opt.rate = p->R_tab->rate; + else + memset(&opt.rate, 0, sizeof(opt.rate)); + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (p->result) + RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate) + 
RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} diff --git a/release/src/linux/linux/net/sched/sch_api.c b/release/src/linux/linux/net/sched/sch_api.c new file mode 100644 index 00000000..a5d8945e --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_api.c @@ -0,0 +1,1256 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * + * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. + * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support + * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/kmod.h> + +#include <net/sock.h> +#include <net/pkt_sched.h> + +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event); + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. 
traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. + - "schedulers", which split all the packets to "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + information supplied by user in the form of handles + to more intelligible for kernel form, to make some sanity + checks and part of work, which is common to all qdiscs + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns 0, if packet was enqueued successfully. + If packet (this one or another one) was dropped, it returns + not zero error code. + NET_XMIT_DROP - this packet dropped + Expected action: do not backoff, but wait until queue will clear. + NET_XMIT_CN - probably this packet enqueued, but another one dropped. + Expected action: backoff or ignore + NET_XMIT_POLICED - dropped by police. + Expected action: backoff or error to real-time apps. + + Auxiliary routines: + + ---requeue + + requeues once dequeued packet. It is used for non-standard or + just buggy devices, which can defer output even if dev->tbusy=0. 
+ + ---reset + + returns qdisc to initial state: purge all buffers, clear all + timers, counters (except for statistics) etc. + + ---init + + initializes newly created qdisc. + + ---destroy + + destroys resources allocated by init and during lifetime of qdisc. + + ---change + + changes qdisc parameters. + */ + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED; + + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. */ + +static struct Qdisc_ops *qdisc_base = NULL; + +/* Register/uregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) { + if (strcmp(qops->id, q->id) == 0) { + write_unlock(&qdisc_mod_lock); + return -EEXIST; + } + } + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->requeue == NULL) + qops->requeue = noop_qdisc_ops.requeue; + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + qops->next = NULL; + *qp = qops; + write_unlock(&qdisc_mod_lock); + return 0; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int err = -ENOENT; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (q) { + *qp = q->next; + q->next = NULL; + err = 0; + } + write_unlock(&qdisc_mod_lock); + return err; +} + +/* We know handle. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) 
+ */ + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + for (q = dev->qdisc_list; q; q = q->next) { + if (q->handle == handle) + return q; + } + return NULL; +} + +struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +{ + unsigned long cl; + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = p->ops->cl_ops; + + if (cops == NULL) + return NULL; + cl = cops->get(p, classid); + + if (cl == 0) + return NULL; + leaf = cops->leaf(p, cl); + cops->put(p, cl); + return leaf; +} + +/* Find queueing discipline by name */ + +struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +{ + struct Qdisc_ops *q = NULL; + + if (kind) { + read_lock(&qdisc_mod_lock); + for (q = qdisc_base; q; q = q->next) { + if (rtattr_strcmp(kind, q->id) == 0) + break; + } + read_unlock(&qdisc_mod_lock); + } + return q; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, RTA_DATA(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} + + +/* Allocate an unique handle from space managed by kernel */ + +u32 qdisc_alloc_handle(struct net_device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + 
+ do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i>0 ? autohandle : 0; +} + +/* Attach toplevel qdisc to device dev */ + +static struct Qdisc * +dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); + if (qdisc && qdisc->flags&TCQ_F_INGRES) { + oqdisc = dev->qdisc_ingress; + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { + /* delete */ + qdisc_reset(oqdisc); + dev->qdisc_ingress = NULL; + } else { /* new */ + dev->qdisc_ingress = qdisc; + } + + } else { + + oqdisc = dev->qdisc_sleeping; + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev->qdisc_sleeping = qdisc; + dev->qdisc = &noop_qdisc; + } + + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return oqdisc; +} + + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + to device "dev". + + Old qdisc is not destroyed but returned in *old. + */ + +int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid, + struct Qdisc *new, struct Qdisc **old) +{ + int err = 0; + struct Qdisc *q = *old; + + + if (parent == NULL) { + if (q && q->flags&TCQ_F_INGRES) { + *old = dev_graft_qdisc(dev, q); + } else { + *old = dev_graft_qdisc(dev, new); + } + } else { + struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + err = -EINVAL; + + if (cops) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, old); + cops->put(parent, cl); + } + } + } + return err; +} + +/* + Allocate and initialize new qdisc. 
+ + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) +{ + int err; + struct rtattr *kind = tca[TCA_KIND-1]; + struct Qdisc *sch = NULL; + struct Qdisc_ops *ops; + int size; + + ops = qdisc_lookup_ops(kind); +#ifdef CONFIG_KMOD + if (ops==NULL && tca[TCA_KIND-1] != NULL) { + char module_name[4 + IFNAMSIZ + 1]; + + if (RTA_PAYLOAD(kind) <= IFNAMSIZ) { + sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind)); + request_module (module_name); + ops = qdisc_lookup_ops(kind); + } + } +#endif + + err = -EINVAL; + if (ops == NULL) + goto err_out; + + size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!sch) + goto err_out; + + /* Grrr... Resolve race condition with module unload */ + + err = -EINVAL; + if (ops != qdisc_lookup_ops(kind)) + goto err_out; + + memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + + if (handle == TC_H_INGRESS) + sch->flags |= TCQ_F_INGRES; + + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + atomic_set(&sch->refcnt, 1); + sch->stats.lock = &dev->queue_lock; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out; + } + + if (handle == TC_H_INGRESS) + sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); + else + sch->handle = handle; + + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { + write_lock(&qdisc_tree_lock); + sch->next = dev->qdisc_list; + dev->qdisc_list = sch; + write_unlock(&qdisc_tree_lock); +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); +#endif + return sch; + } + +err_out: + *errp = err; + if (sch) + kfree(sch); + return NULL; +} + +static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +{ + if (tca[TCA_OPTIONS-1]) { + int err; + + if (sch->ops->change == NULL) + return -EINVAL; + err = sch->ops->change(sch, 
tca[TCA_OPTIONS-1]); + if (err) + return err; + } +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + qdisc_kill_estimator(&sch->stats); + qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); + } +#endif + return 0; +} + +struct check_loop_arg +{ + struct qdisc_walker w; + struct Qdisc *p; + int depth; +}; + +static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); + +static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) +{ + struct check_loop_arg arg; + + if (q->ops->cl_ops == NULL) + return 0; + + arg.w.stop = arg.w.skip = arg.w.count = 0; + arg.w.fn = check_loop_fn; + arg.depth = depth; + arg.p = p; + q->ops->cl_ops->walk(q, &arg.w); + return arg.w.stop ? -ELOOP : 0; +} + +static int +check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) +{ + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct check_loop_arg *arg = (struct check_loop_arg *)w; + + leaf = cops->leaf(q, cl); + if (leaf) { + if (leaf == arg->p || arg->depth > 7) + return -ELOOP; + return check_loop(leaf, arg->p, arg->depth + 1); + } + return 0; +} + +/* + * Delete/get qdisc. 
+ */ + +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /* ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + if (!q) + return -ENOENT; + + if (tcm->tcm_handle && q->handle != tcm->tcm_handle) + return -EINVAL; + } else { + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + return -ENOENT; + } + + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + + if (n->nlmsg_type == RTM_DELQDISC) { + if (!clid) + return -EINVAL; + if (q->handle == 0) + return -ENOENT; + if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) + return err; + if (q) { + qdisc_notify(skb, n, clid, q, NULL); + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + } else { + qdisc_notify(skb, n, clid, NULL, q); + } + return 0; +} + +/* + Create/change qdisc. 
+ */ + +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (clid != TC_H_INGRESS) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /*ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + + /* It may be default qdisc, ignore it */ + if (q && q->handle == 0) + q = NULL; + + if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { + if (tcm->tcm_handle) { + if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + return -EEXIST; + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + goto create_n_graft; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + if (q == p || + (p && check_loop(q, p, 0))) + return -ELOOP; + atomic_inc(&q->refcnt); + goto graft; + } else { + if (q == NULL) + goto create_n_graft; + + /* This magic test requires explanation. + * + * We know, that some child q is already + * attached to this parent and have choice: + * either to change it or to create/graft new one. + * + * 1. We are allowed to create/graft only + * if CREATE and REPLACE flags are set. + * + * 2. If EXCL is set, requestor wanted to say, + * that qdisc tcm_handle is not expected + * to exist, so that we choose create/graft too. + * + * 3. The last case is when no flags are set. + * Alas, it is sort of hole in API, we + * cannot decide what to do unambiguously. + * For now we select create/graft, if + * user gave KIND, which does not match existing. 
+ */ + if ((n->nlmsg_flags&NLM_F_CREATE) && + (n->nlmsg_flags&NLM_F_REPLACE) && + ((n->nlmsg_flags&NLM_F_EXCL) || + (tca[TCA_KIND-1] && + rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + goto create_n_graft; + } + } + } else { + if (!tcm->tcm_handle) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* Change qdisc parameters */ + if (q == NULL) + return -ENOENT; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + err = qdisc_change(q, tca); + if (err == 0) + qdisc_notify(skb, n, clid, NULL, q); + return err; + +create_n_graft: + if (!(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (clid == TC_H_INGRESS) + q = qdisc_create(dev, tcm->tcm_parent, tca, &err); + else + q = qdisc_create(dev, tcm->tcm_handle, tca, &err); + if (q == NULL) + return err; + +graft: + if (1) { + struct Qdisc *old_q = NULL; + err = qdisc_graft(dev, p, clid, q, &old_q); + if (err) { + if (q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + return err; + } + qdisc_notify(skb, n, clid, old_q, q); + if (old_q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(old_q); + spin_unlock_bh(&dev->queue_lock); + } + } + return 0; +} + +int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st) +{ + spin_lock_bh(st->lock); + RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st); + spin_unlock_bh(st->lock); + return 0; + +rtattr_failure: + spin_unlock_bh(st->lock); + return -1; +} + + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev ? 
q->dev->ifindex : 0; + tcm->tcm_parent = clid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = atomic_read(&q->refcnt); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto rtattr_failure; + q->stats.qlen = q->q.qlen; + if (qdisc_copy_stats(skb, &q->stats)) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + u32 clid, struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && old->handle) { + if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new) { + if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, q_idx; + int s_idx, s_q_idx; + struct net_device *dev; + struct Qdisc *q; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_q_idx = 0; + read_lock(&qdisc_tree_lock); + for (q = dev->qdisc_list, q_idx = 0; q; + q = q->next, q_idx++) { + if (q_idx < s_q_idx) + continue; + if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { + read_unlock(&qdisc_tree_lock); + goto done; + } + } + read_unlock(&qdisc_tree_lock); + } + +done: + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + 
+/************************************************ + * Traffic classes manipulation. * + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + struct Qdisc *q = NULL; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. */ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc_sleeping->handle; + + /* Now qid is genuine qdisc handle consistent + both with parent and child. + + TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc_sleeping->handle; + } + + /* OK. 
Locate qdisc */ + if ((q = qdisc_lookup(dev, qid)) == NULL) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0) + goto rtattr_failure; + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct qdisc_dump_args +{ + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct qdisc_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return 0; + + s_t = cb->args[0]; + + read_lock(&qdisc_tree_lock); + for (q=dev->qdisc_list, t=0; q; q = q->next, t++) { + if (t < s_t) continue; + if (!q->ops->cl_ops) continue; + if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + break; + } + read_unlock(&qdisc_tree_lock); + + cb->args[0] = t; + + dev_put(dev); + return skb->len; +} + +int psched_us_per_tick = 1; +int psched_tick_per_us = 1; + +#ifdef CONFIG_PROC_FS +static int psched_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len; + + len = sprintf(buffer, "%08x %08x %08x %08x\n", + psched_tick_per_us, 
psched_us_per_tick, + 1000000, HZ); + + len -= offset; + + if (len > length) + len = length; + if(len < 0) + len = 0; + + *start = buffer + offset; + *eof = 1; + + return len; +} +#endif + +#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY +int psched_tod_diff(int delta_sec, int bound) +{ + int delta; + + if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1) + return bound; + delta = delta_sec * 1000000; + if (delta > bound) + delta = bound; + return delta; +} +#endif + +psched_time_t psched_time_base; + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +psched_tdiff_t psched_clock_per_hz; +int psched_clock_scale; +#endif + +#ifdef PSCHED_WATCHER +PSCHED_WATCHER psched_time_mark; + +static void psched_tick(unsigned long); + +static struct timer_list psched_timer = + { function: psched_tick }; + +static void psched_tick(unsigned long dummy) +{ +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + psched_time_t dummy_stamp; + PSCHED_GET_TIME(dummy_stamp); + /* It is OK up to 4GHz cpu */ + psched_timer.expires = jiffies + 1*HZ; +#else + unsigned long now = jiffies; + psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE; + psched_time_mark = now; + psched_timer.expires = now + 60*60*HZ; +#endif + add_timer(&psched_timer); +} +#endif + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU +int __init psched_calibrate_clock(void) +{ + psched_time_t stamp, stamp1; + struct timeval tv, tv1; + psched_tdiff_t delay; + long rdelay; + unsigned long stop; + +#ifdef PSCHED_WATCHER + psched_tick(0); +#endif + stop = jiffies + HZ/10; + PSCHED_GET_TIME(stamp); + do_gettimeofday(&tv); + while (time_before(jiffies, stop)) { + barrier(); + cpu_relax(); + } + PSCHED_GET_TIME(stamp1); + do_gettimeofday(&tv1); + + delay = PSCHED_TDIFF(stamp1, stamp); + rdelay = tv1.tv_usec - tv.tv_usec; + rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; + if (rdelay > delay) + return -1; + delay /= rdelay; + psched_tick_per_us = delay; + while ((delay>>=1) != 0) + psched_clock_scale++; + psched_us_per_tick = 1<<psched_clock_scale; + 
psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; + return 0; +} +#endif + +int __init pktsched_init(void) +{ + struct rtnetlink_link *link_p; + +#if PSCHED_CLOCK_SOURCE == PSCHED_CPU + if (psched_calibrate_clock() < 0) + return -1; +#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES + psched_tick_per_us = HZ<<PSCHED_JSCALE; + psched_us_per_tick = 1000000; +#ifdef PSCHED_WATCHER + psched_tick(0); +#endif +#endif + + link_p = rtnetlink_links[PF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; + link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; + link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; + link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; + link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; + } + +#define INIT_QDISC(name) { \ + extern struct Qdisc_ops name##_qdisc_ops; \ + register_qdisc(& name##_qdisc_ops); \ + } + + INIT_QDISC(pfifo); + INIT_QDISC(bfifo); + +#ifdef CONFIG_NET_SCH_CBQ + INIT_QDISC(cbq); +#endif +#ifdef CONFIG_NET_SCH_HTB + INIT_QDISC(htb); +#endif +#ifdef CONFIG_NET_SCH_CSZ + INIT_QDISC(csz); +#endif +#ifdef CONFIG_NET_SCH_HPFQ + INIT_QDISC(hpfq); +#endif +#ifdef CONFIG_NET_SCH_HFSC + INIT_QDISC(hfsc); +#endif +#ifdef CONFIG_NET_SCH_RED + INIT_QDISC(red); +#endif +#ifdef CONFIG_NET_SCH_GRED + INIT_QDISC(gred); +#endif +#ifdef CONFIG_NET_SCH_INGRESS + INIT_QDISC(ingress); +#endif +#ifdef CONFIG_NET_SCH_DSMARK + INIT_QDISC(dsmark); +#endif +#ifdef CONFIG_NET_SCH_SFQ + INIT_QDISC(sfq); +#endif +#ifdef CONFIG_NET_SCH_TBF + INIT_QDISC(tbf); +#endif +#ifdef CONFIG_NET_SCH_TEQL + teql_init(); +#endif +#ifdef CONFIG_NET_SCH_PRIO + INIT_QDISC(prio); +#endif +#ifdef CONFIG_NET_SCH_ATM + INIT_QDISC(atm); +#endif +#ifdef 
CONFIG_NET_CLS + tc_filter_init(); +#endif + +#ifdef CONFIG_PROC_FS + create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL); +#endif + + return 0; +} diff --git a/release/src/linux/linux/net/sched/sch_atm.c b/release/src/linux/linux/net/sched/sch_atm.c new file mode 100644 index 00000000..1a90e091 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_atm.c @@ -0,0 +1,710 @@ +/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/interrupt.h> +#include <linux/atmdev.h> +#include <linux/atmclip.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/file.h> /* for fput */ +#include <net/pkt_sched.h> +#include <net/sock.h> + + +extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ +#define sockfd_put(sock) fput((sock)->file) /* @@@ copied because it's + __inline__ in socket.c */ + + +#define DPRINTK(format,args...) + +#define D2PRINTK(format,args...) + + +/* + * The ATM queuing discipline provides a framework for invoking classifiers + * (aka "filters"), which in turn select classes of this queuing discipline. + * Each class maps the flow(s) it is handling to a given VC. Multiple classes + * may share the same VC. + * + * When creating a class, VCs are specified by passing the number of the open + * socket descriptor by which the calling process references the VC. The kernel + * keeps the VC open at least until all classes using it are removed. + * + * In this file, most functions are named atm_tc_* to avoid confusion with all + * the atm_* in net/atm. This naming convention differs from what's used in the + * rest of net/sched. 
+ * + * Known bugs: + * - sometimes messes up the IP stack + * - any manipulations besides the few operations described in the README, are + * untested and likely to crash the system + * - should lock the flow while there is data in the queue (?) + */ + + +#define PRIV(sch) ((struct atm_qdisc_data *) (sch)->data) +#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) + + +struct atm_flow_data { + struct Qdisc *q; /* FIFO, TBF, etc. */ + struct tcf_proto *filter_list; + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + u32 classid; /* x:y type ID */ + int ref; /* reference count */ + struct tc_stats stats; + struct atm_flow_data *next; + struct atm_flow_data *excess; /* flow for excess traffic; + NULL to set CLP instead */ + int hdr_len; + unsigned char hdr[0]; /* header data; MUST BE LAST */ +}; + +struct atm_qdisc_data { + struct atm_flow_data link; /* unclassified skbs go here */ + struct atm_flow_data *flows; /* NB: "link" is also on this + list */ + struct tasklet_struct task; /* requeue tasklet */ +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) +{ + struct atm_flow_data *walk; + + DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); + for (walk = qdisc->flows; walk; walk = walk->next) + if (walk == flow) return 1; + DPRINTK("find_flow: not found\n"); + return 0; +} + + +static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, + u32 classid) +{ + struct atm_flow_data *flow; + + for (flow = PRIV(sch)->flows; flow; flow = flow->next) + if (flow->classid == classid) break; + return flow; +} + + +static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct atm_qdisc_data *p = PRIV(sch); + 
struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, + p,flow,new,old); + if (!find_flow(p,flow)) return -EINVAL; + if (!new) new = &noop_qdisc; + *old = xchg(&flow->q,new); + if (*old) qdisc_reset(*old); + return 0; +} + + +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); + return flow ? flow->q : NULL; +} + + +static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) +{ + struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + flow = lookup_flow(sch,classid); + if (flow) flow->ref++; + DPRINTK("atm_tc_get: flow %p\n",flow); + return (unsigned long) flow; +} + + +static unsigned long atm_tc_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return atm_tc_get(sch,classid); +} + + +static void destroy_filters(struct atm_flow_data *flow) +{ + struct tcf_proto *filter; + + while ((filter = flow->filter_list)) { + DPRINTK("destroy_filters: destroying filter %p\n",filter); + flow->filter_list = filter->next; + filter->ops->destroy(filter); + } +} + + +/* + * atm_tc_put handles all destructions, including the ones that are explicitly + * requested (atm_tc_destroy, etc.). The assumption here is that we never drop + * anything that still seems to be in use. 
+ */ + +static void atm_tc_put(struct Qdisc *sch, unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_flow_data **prev; + + DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (--flow->ref) return; + DPRINTK("atm_tc_put: destroying\n"); + for (prev = &p->flows; *prev; prev = &(*prev)->next) + if (*prev == flow) break; + if (!*prev) { + printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); + return; + } + *prev = flow->next; + DPRINTK("atm_tc_put: qdisc %p\n",flow->q); + qdisc_destroy(flow->q); + destroy_filters(flow); + if (flow->sock) { + DPRINTK("atm_tc_put: f_count %d\n", + file_count(flow->sock->file)); + flow->vcc->pop = flow->old_pop; + sockfd_put(flow->sock); + } + if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); + if (flow != &p->link) kfree(flow); + /* + * If flow == &p->link, the qdisc no longer works at this point and + * needs to be removed. (By the caller of atm_tc_put.) + */ +} + + +static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; + + D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p); + VCC2FLOW(vcc)->old_pop(vcc,skb); + tasklet_schedule(&p->task); +} + + +static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) *arg; + struct atm_flow_data *excess = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ATM_MAX]; + struct socket *sock; + int fd,error,hdr_len; + void *hdr; + + DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," + "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt); + /* + * The concept of parents doesn't apply for this qdisc. + */ + if (parent && parent != TC_H_ROOT && parent != sch->handle) + return -EINVAL; + /* + * ATM classes cannot be changed. 
In order to change properties of the + * ATM connection, that socket needs to be modified directly (via the + * native ATM API. In order to send a flow to a different VC, the old + * class needs to be removed and a new one added. (This may be changed + * later.) + */ + if (flow) return -EBUSY; + if (opt == NULL || rtattr_parse(tb,TCA_ATM_MAX,RTA_DATA(opt), + RTA_PAYLOAD(opt))) return -EINVAL; + if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) + return -EINVAL; + fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); + DPRINTK("atm_tc_change: fd %d\n",fd); + if (tb[TCA_ATM_HDR-1]) { + hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); + hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); + } + else { + hdr_len = RFC1483LLC_LEN; + hdr = NULL; /* default LLC/SNAP for IP */ + } + if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; + else { + if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) + return -EINVAL; + excess = (struct atm_flow_data *) atm_tc_get(sch, + *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); + if (!excess) return -ENOENT; + } + DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->rta_type,RTA_PAYLOAD(opt),hdr_len); + if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ + DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + error = -EPROTOTYPE; + goto err_out; + } + /* @@@ should check if the socket is really operational or we'll crash + on vcc->send */ + if (classid) { + if (TC_H_MAJ(classid ^ sch->handle)) { + DPRINTK("atm_tc_change: classid mismatch\n"); + error = -EINVAL; + goto err_out; + } + if (find_flow(p,flow)) { + error = -EEXIST; + goto err_out; + } + } + else { + int i; + unsigned long cl; + + for (i = 1; i < 0x8000; i++) { + classid = TC_H_MAKE(sch->handle,0x8000 | i); + if (!(cl = atm_tc_get(sch,classid))) break; + atm_tc_put(sch,cl); + } + } + DPRINTK("atm_tc_change: new id %x\n",classid); + flow = kmalloc(sizeof(struct 
atm_flow_data)+hdr_len,GFP_KERNEL); + DPRINTK("atm_tc_change: flow %p\n",flow); + if (!flow) { + error = -ENOBUFS; + goto err_out; + } + memset(flow,0,sizeof(*flow)); + flow->filter_list = NULL; + if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + flow->q = &noop_qdisc; + DPRINTK("atm_tc_change: qdisc %p\n",flow->q); + flow->sock = sock; + flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc->user_back = flow; + DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); + flow->old_pop = flow->vcc->pop; + flow->parent = p; + flow->vcc->pop = sch_atm_pop; + flow->classid = classid; + flow->ref = 1; + flow->excess = excess; + flow->next = p->link.next; + p->link.next = flow; + flow->hdr_len = hdr_len; + if (hdr) memcpy(flow->hdr,hdr,hdr_len); + else { + memcpy(flow->hdr,llc_oui,sizeof(llc_oui)); + ((u16 *) flow->hdr)[3] = htons(ETH_P_IP); + } + *arg = (unsigned long) flow; + return 0; +err_out: + if (excess) atm_tc_put(sch,(unsigned long) excess); + sockfd_put(sock); + return error; +} + + +static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (!find_flow(PRIV(sch),flow)) return -EINVAL; + if (flow->filter_list || flow == &p->link) return -EBUSY; + /* + * Reference count must be 2: one for "keepalive" (set at class + * creation), and one for the reference held when calling delete. 
+ */ + if (flow->ref < 2) { + printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); + return -EINVAL; + } + if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ + atm_tc_put(sch,arg); + return 0; +} + + +static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) return; + for (flow = p->flows; flow; flow = flow->next) { + if (walker->count >= walker->skip) + if (walker->fn(sch,(unsigned long) flow,walker) < 0) { + walker->stop = 1; + break; + } + walker->count++; + } +} + + +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + return flow ? &flow->filter_list : &p->link.filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = NULL ; /* @@@ */ + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) != sch->handle || + !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) + for (flow = p->flows; flow; flow = flow->next) + if (flow->filter_list) { + result = tc_classify(skb,flow->filter_list, + &res); + if (result < 0) continue; + flow = (struct atm_flow_data *) res.class; + if (!flow) flow = lookup_flow(sch,res.classid); + break; + } + if (!flow) flow = &p->link; + else { + if (flow->vcc) + ATM_SKB(skb)->atm_options = flow->vcc->atm_options; + /*@@@ looks good ... 
but it's not supposed to work :-)*/ +#ifdef CONFIG_NET_CLS_POLICE + switch (result) { + case TC_POLICE_SHOT: + kfree_skb(skb); + break; + case TC_POLICE_RECLASSIFY: + if (flow->excess) flow = flow->excess; + else { + ATM_SKB(skb)->atm_options |= + ATM_ATMOPT_CLP; + break; + } + /* fall through */ + case TC_POLICE_OK: + /* fall through */ + default: + break; + } +#endif + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + (ret = flow->q->enqueue(skb,flow->q)) != 0) { + sch->stats.drops++; + if (flow) flow->stats.drops++; + return ret; + } + sch->stats.bytes += skb->len; + sch->stats.packets++; + flow->stats.bytes += skb->len; + flow->stats.packets++; + /* + * Okay, this may seem weird. We pretend we've dropped the packet if + * it goes via ATM. The reason for this is that the outer qdisc + * expects to be able to q->dequeue the packet later on if we return + * success at this place. Also, sch->q.qdisc needs to reflect whether + * there is a packet egligible for dequeuing or not. Note that the + * statistics of the outer qdisc are necessarily wrong because of all + * this. There's currently no correct solution for this. + */ + if (flow == &p->link) { + sch->q.qlen++; + return 0; + } + tasklet_schedule(&p->task); + return NET_XMIT_BYPASS; +} + + +/* + * Dequeue packets and send them over ATM. Note that we quite deliberately + * avoid checking net_device's flow control here, simply because sch_atm + * uses its own channels, which have nothing to do with any CLIP/LANE/or + * non-ATM interfaces. + */ + + +static void sch_atm_dequeue(unsigned long data) +{ + struct Qdisc *sch = (struct Qdisc *) data; + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + struct sk_buff *skb; + + D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->link.next; flow; flow = flow->next) + /* + * If traffic is properly shaped, this won't generate nasty + * little bursts. Otherwise, it may ... 
(but that's okay) + */ + while ((skb = flow->q->dequeue(flow->q))) { + if (!atm_may_send(flow->vcc,skb->truesize)) { + (void) flow->q->ops->requeue(skb,flow->q); + break; + } + D2PRINTK("atm_tc_deqeueue: sending on class %p\n",flow); + /* remove any LL header somebody else has attached */ + skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + if (skb_headroom(skb) < flow->hdr_len) { + struct sk_buff *new; + + new = skb_realloc_headroom(skb,flow->hdr_len); + dev_kfree_skb(skb); + if (!new) continue; + skb = new; + } + D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", + skb->nh.iph,skb->data); + ATM_SKB(skb)->vcc = flow->vcc; + memcpy(skb_push(skb,flow->hdr_len),flow->hdr, + flow->hdr_len); + atomic_add(skb->truesize,&flow->vcc->tx_inuse); + ATM_SKB(skb)->iovcnt = 0; + /* atm.atm_options are already set by atm_tc_enqueue */ + (void) flow->vcc->send(flow->vcc,skb); + } +} + + +static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + + D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); + tasklet_schedule(&p->task); + skb = p->link.q->dequeue(p->link.q); + if (skb) sch->q.qlen--; + return skb; +} + + +static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + int ret; + + D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + ret = p->link.q->ops->requeue(skb,p->link.q); + if (!ret) sch->q.qlen++; + else { + sch->stats.drops++; + p->link.stats.drops++; + } + return ret; +} + + +static int atm_tc_drop(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) + if (flow->q->ops->drop && flow->q->ops->drop(flow->q)) + return 1; + return 0; +} + + +static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct atm_qdisc_data *p = PRIV(sch); + + DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt 
%p)\n",sch,p,opt); + memset(p,0,sizeof(*p)); + p->flows = &p->link; + if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + p->link.q = &noop_qdisc; + DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); + p->link.filter_list = NULL; + p->link.vcc = NULL; + p->link.sock = NULL; + p->link.classid = sch->handle; + p->link.ref = 1; + p->link.next = NULL; + tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); + MOD_INC_USE_COUNT; + return 0; +} + + +static void atm_tc_reset(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); + sch->q.qlen = 0; +} + + +static void atm_tc_destroy(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); + /* races ? */ + while ((flow = p->flows)) { + destroy_filters(flow); + if (flow->ref > 1) + printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, + flow->ref); + atm_tc_put(sch,(unsigned long) flow); + if (p->flows == flow) { + printk(KERN_ERR "atm_destroy: putting flow %p didn't " + "kill it\n",flow); + p->flows = flow->next; /* brute force */ + break; + } + } + tasklet_kill(&p->task); + MOD_DEC_USE_COUNT; +} + + +static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch,p,flow,skb,tcm); + if (!find_flow(p,flow)) return -EINVAL; + tcm->tcm_handle = flow->classid; + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); + if (flow->vcc) { + struct sockaddr_atmpvc pvc; + int state; + + pvc.sap_family = AF_ATMPVC; 
+ pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; + pvc.sap_addr.vpi = flow->vcc->vpi; + pvc.sap_addr.vci = flow->vcc->vci; + RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); + state = ATM_VF2VS(flow->vcc->flags); + RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); + } + if (flow->excess) + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); + else { + static u32 zero = 0; + + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); + } + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return 0; +} + +static struct Qdisc_class_ops atm_class_ops = +{ + atm_tc_graft, /* graft */ + atm_tc_leaf, /* leaf */ + atm_tc_get, /* get */ + atm_tc_put, /* put */ + atm_tc_change, /* change */ + atm_tc_delete, /* delete */ + atm_tc_walk, /* walk */ + + atm_tc_find_tcf, /* tcf_chain */ + atm_tc_bind_filter, /* bind_tcf */ + atm_tc_put, /* unbind_tcf */ + + atm_tc_dump_class, /* dump */ +}; + +struct Qdisc_ops atm_qdisc_ops = +{ + NULL, /* next */ + &atm_class_ops, /* cl_ops */ + "atm", + sizeof(struct atm_qdisc_data), + + atm_tc_enqueue, /* enqueue */ + atm_tc_dequeue, /* dequeue */ + atm_tc_requeue, /* requeue */ + atm_tc_drop, /* drop */ + + atm_tc_init, /* init */ + atm_tc_reset, /* reset */ + atm_tc_destroy, /* destroy */ + NULL, /* change */ + + atm_tc_dump /* dump */ +}; + + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&atm_qdisc_ops); +} + + +void cleanup_module(void) +{ + unregister_qdisc(&atm_qdisc_ops); +} +#endif diff --git a/release/src/linux/linux/net/sched/sch_cbq.c b/release/src/linux/linux/net/sched/sch_cbq.c new file mode 100644 index 00000000..761d7f08 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_cbq.c @@ -0,0 +1,2062 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + + +struct cbq_sched_data; + + +struct cbq_class +{ + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Parameters */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; +#endif + + u32 defmap; + + /* Link-sharing scheduler parameters */ + long maxidle; /* Class paramters: see below. 
*/ + long offtime; + long minidle; + u32 avpkt; + struct qdisc_rate_table *R_tab; + + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; + + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ + + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ + + struct Qdisc *q; /* Elementary queueing discipline */ + + +/* Variables */ + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. 
/*
 * Fold a 32-bit class ID into a 4-bit bucket index (0..15) by XOR-ing the
 * value with itself shifted right by 8 and then by 4 bits.
 */
static __inline__ unsigned cbq_hash(u32 h)
{
	u32 folded = h ^ (h >> 8);

	return (folded ^ (folded >> 4)) & 0xF;
}
/*
 * cbq_classify - pick the class a packet is to be queued on.
 *
 * Starting at the root ("link") class, the filters attached to the current
 * head are applied repeatedly; each result must name a class of strictly
 * lower level than the head, and the walk ends when a level-0 class is
 * reached. Whenever a head has no filters, classification fails, or the
 * result is unusable, the head's defaults[] table is consulted instead.
 *
 * Returns the selected class. With CONFIG_NET_CLS_POLICE, NULL is returned
 * when the classifier verdict is TC_POLICE_SHOT, and the result of
 * cbq_reclassify() (which may be NULL) on TC_POLICE_RECLASSIFY.
 */
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data;
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	for (;;) {
		int result = 0;

		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
			goto fallback;

		/*
		 * The filter did not hand back a class pointer: resolve the
		 * class ID, either as a full ID (non-zero major) or as a
		 * priority index into the head's default map.
		 */
		if ((cl = (void*)res.class) == NULL) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			/* only accept classes strictly below the current head */
			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_POLICE
		switch (result) {
		case TC_POLICE_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		case TC_POLICE_SHOT:
			return NULL;
		default:
			break;
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	   apply agency specific classifier.
		 *	   Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 *
	 * If skb->priority has no major number, use it as an index into the
	 * head's defaults, then try the best-effort default; if neither is
	 * set (or the priority does have a major number) the head itself is
	 * returned.
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
+ */ + +static __inline__ void cbq_activate_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + int prio = cl->cpriority; + struct cbq_class *cl_tail; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; + } else { + cl->next_alive = cl; + q->activemask |= (1<<prio); + } +} + +/* + Unlink class from active chain. + Note that this same procedure is done directly in cbq_dequeue* + during round-robin procedure. + */ + +static void cbq_deactivate_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + return; + } + } + + cl = cl_prev->next_alive; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + int toplevel = q->toplevel; + + if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + PSCHED_TADD2(q->now, incr, now); + + do { + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = cl->level; + return; + } + } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_classify(skb, sch); + int len = skb->len; + int ret = NET_XMIT_POLICED; + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; +#endif + if (cl) { +#ifdef 
CONFIG_NET_CLS_POLICE + cl->q->__parent = sch; +#endif + if ((ret = cl->q->enqueue(skb, cl->q)) == 0) { + sch->q.qlen++; + sch->stats.packets++; + sch->stats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + } + + sch->stats.drops++; + if (cl == NULL) + kfree_skb(skb); + else { + cbq_mark_toplevel(q, cl); + cl->stats.drops++; + } + return ret; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int ret; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->stats.drops++; + return NET_XMIT_CN; + } + q->tx_class = NULL; + + cbq_mark_toplevel(q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; + cl->q->__parent = sch; +#endif + if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { + sch->q.qlen++; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->stats.drops++; + cl->stats.drops++; + return ret; +} + +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. + */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay <= 0) + delay = 1; + PSCHED_TADD2(q->now, delay, cl->undertime); + + cl->xstats.overactions++; + cl->delayed = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + + /* Dirty work! 
/*
 * TC_CBQ_OVL_RCLASSIC overlimit action.
 *
 * Walks from "cl" up its borrow chain, stopping early at the first class
 * whose level exceeds q->toplevel, and then applies the classic overlimit
 * action (cbq_ovl_classic).
 *
 * NOTE(review): both ways out of the loop leave cl == NULL (explicitly on
 * the early break, and by the loop condition otherwise), so as written the
 * classic action is always applied to the original class "this". Confirm
 * whether penalizing an ancestor was intended.
 */
static void cbq_ovl_rclassic(struct cbq_class *cl)
{
	struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data;
	struct cbq_class *this = cl;	/* remember the starting class */

	do {
		if (cl->level > q->toplevel) {
			cl = NULL;
			break;
		}
	} while ((cl = cl->borrow) != NULL);

	if (cl == NULL)
		cl = this;
	cbq_ovl_classic(cl);
}
struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<<cl->cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } + } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1<<prio); + + tmp = cbq_undelay_prio(q, prio); + if (tmp > 0) { + q->pmask |= 1<<prio; + if (tmp < delay || delay == 0) + delay = tmp; + } + } + + if (delay) { + q->delay_timer.expires = jiffies + delay; + 
add_timer(&q->delay_timer); + } + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->__parent; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = q->rx_class; + + q->rx_class = NULL; + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + + cbq_mark_toplevel(q, cl); + + q->rx_class = cl; + cl->q->__parent = sch; + + if (cl->q->enqueue(skb, cl->q) == 0) { + sch->q.qlen++; + sch->stats.packets++; + sch->stats.bytes+=len; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->stats.drops++; + return 0; + } + + sch->stats.drops++; + return -1; +} +#endif + +/* + It is mission critical procedure. + + We "regenerate" toplevel cutoff, if transmitting class + has backlog and it is not regulated. It is not part of + original CBQ description, but looks more reasonable. + Probably, it is wrong. This question needs further investigation. +*/ + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, + struct cbq_class *borrowed) +{ + if (cl && q->toplevel >= borrowed->level) { + if (cl->q->q.qlen > 1) { + do { + if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { + q->toplevel = borrowed->level; + return; + } + } while ((borrowed=borrowed->borrow) != NULL); + } + } +} + +static void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *this = q->tx_class; + struct cbq_class *cl = this; + int len = q->tx_len; + + q->tx_class = NULL; + + for ( ; cl; cl = cl->share) { + long avgidle = cl->avgidle; + long idle; + + cl->stats.packets++; + cl->stats.bytes += len; + + /* + (now - last) is total time between packet right edges. 
+ (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last); + if ((unsigned long)idle > 128*1024*1024) { + avgidle = cl->maxidle; + } else { + idle -= L2T(cl, len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-ewma_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->ewma_log); + } + + if (avgidle <= 0) { + /* Overlimit or at-limit */ + + if (avgidle < cl->minidle) + avgidle = cl->minidle; + + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + + /* + That is not all. + To maintain the rate allocated to the class, + we add to undertime virtual clock, + necesary to complete transmitted packet. + (len/phys_bandwidth has been already passed + to the moment of cbq_update) + */ + + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + + PSCHED_TADD2(q->now, idle, cl->undertime); + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + + cbq_update_toplevel(q, this, q->tx_borrowed); +} + +static __inline__ struct cbq_class * +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *this_cl = cl; + + if (cl->tparent == NULL) + return cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + !PSCHED_TLESS(q->now, cl->undertime)) { + cl->delayed = 0; + return cl; + } + + do { + /* It is very suspicious place. Now overlimit + action is generated for not bounded classes + only if link is completely congested. 
+ Though it is in agree with ancestor-only paradigm, + it looks very stupid. Particularly, + it means that this chunk of code will either + never be called or result in strong amplification + of burstiness. Dangerous, silly, and, however, + no another solution exists. + */ + if ((cl = cl->borrow) == NULL) { + this_cl->stats.overlimits++; + this_cl->overlimit(this_cl); + return NULL; + } + if (cl->level > q->toplevel) + return NULL; + } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)); + + cl->delayed = 0; + return cl; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + struct cbq_class *borrow = cl; + + if (cl->q->q.qlen && + (borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per + this round. Switch to the next one. + */ + deficit = 1; + cl->deficit += cl->quantum; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur even if cl->q->q.qlen != 0 + f.e. if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + if (borrow != cl) { +#ifndef CBQ_XSTATS_BORROWS_BYTES + borrow->xstats.borrows++; + cl->xstats.borrows++; +#else + borrow->xstats.borrows += skb->len; + cl->xstats.borrows += skb->len; +#endif + } + q->tx_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. 
+ */ + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! */ + q->active[prio] = NULL; + q->activemask &= ~(1<<prio); + if (cl->q->q.qlen) + cbq_activate_class(cl); + return NULL; + } + + q->active[prio] = cl_tail; + } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask&0xFF; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<<prio); + skb = cbq_dequeue_prio(sch, prio); + if (skb) + return skb; + } + return NULL; +} + +static struct sk_buff * +cbq_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + + if (q->tx_class) { + psched_tdiff_t incr2; + /* Time integrator. We calculate EOS time + by adding expected packet transmittion time. + If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + incr2 = L2T(&q->link, q->tx_len); + PSCHED_TADD(q->now, incr2); + cbq_update(q); + if ((incr -= incr2) < 0) + incr = 0; + } + PSCHED_TADD(q->now, incr); + q->now_rt = now; + + for (;;) { + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + /* All the classes are overlimit. + + It is possible, if: + + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. 
Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because it requires + two passes, but it is unavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen) { + sch->stats.overlimits++; + if (q->wd_expires && !netif_queue_stopped(sch->dev)) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); + del_timer(&q->wd_timer); + if (delay <= 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + sch->flags |= TCQ_F_THROTTLED; + } + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + if (this == NULL) + return; + + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + unsigned h; + + if (q->quanta[prio] == 0) + return; + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! 
+ */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); + cl->quantum = cl->qdisc->dev->mtu/2 + 1; + } + } + } +} + +static void cbq_sync_defmap(struct cbq_class *cl) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)cl->qdisc->data; + struct cbq_class *split = cl->split; + unsigned h; + int i; + + if (split == NULL) + return; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + split->defaults[i] = NULL; + } + + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; + + if (split->defaults[i]) + continue; + + for (h=0; h<16; h++) { + struct cbq_class *c; + + for (c = q->classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<<i)) { + split->defaults[i] = c; + level = c->level; + } + } + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; + } + + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; + } + + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); + + cbq_sync_defmap(cl); +} + +static void cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + + if (this->tparent) { + 
clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); + + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); + } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)this->qdisc->data; + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; + + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; + } +} + +static int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl, *cl_head; + int prio; + + for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { + if ((cl_head = q->active[prio]) == NULL) + continue; + + cl = cl_head; + do { + if (cl->q->ops->drop && cl->q->ops->drop(cl->q)) { + sch->q.qlen--; + return 1; + } + } while ((cl = cl->next_alive) != cl_head); + } + return 0; +} + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + int prio; + unsigned h; + + q->activemask = 0; + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) + q->active[prio] = NULL; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = cl->maxidle; + cl->deficit = cl->quantum; + 
cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + cl->maxidle = lss->maxidle; + cl->avgidle = lss->maxidle; + } + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)cl->qdisc->data; + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + 
break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (cl->q->handle) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = RTA_DATA(tb[TCA_CBQ_RATE-1]); + + MOD_INC_USE_COUNT; + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + q->link.refcnt = 1; + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = 
q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + q->link.stats.lock = &sch->dev->queue_lock; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; 
+ struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +int cbq_copy_xstats(struct sk_buff *skb, struct tc_cbq_xstats *st) +{ + RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st); + return 0; + +rtattr_failure: + return -1; +} + + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + spin_lock_bh(&sch->dev->queue_lock); + q->link.xstats.avgidle = q->link.avgidle; 
+ if (cbq_copy_xstats(skb, &q->link.xstats)) { + spin_unlock_bh(&sch->dev->queue_lock); + goto rtattr_failure; + } + spin_unlock_bh(&sch->dev->queue_lock); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_sched_data *q = (struct cbq_sched_data*)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + tcm->tcm_info = cl->q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + cl->stats.qlen = cl->q->q.qlen; + if (qdisc_copy_stats(skb, &cl->stats)) + goto rtattr_failure; + spin_lock_bh(&sch->dev->queue_lock); + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + q->link.xstats.avgidle = q->link.avgidle; + if (cbq_copy_xstats(skb, &cl->xstats)) { + spin_unlock_bh(&sch->dev->queue_lock); + goto rtattr_failure; + } + spin_unlock_bh(&sch->dev->queue_lock); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; + } + return -ENOENT; +} + +static 
struct Qdisc * +cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + return cl ? cl->q : NULL; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tp->ops->destroy(tp); + } +} + +static void cbq_destroy_class(struct cbq_class *cl) +{ + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&cl->stats); +#endif + kfree(cl); +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl; + unsigned h; + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = NULL; +#endif + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) + cbq_destroy_filters(cl); + } + + for (h = 0; h < 16; h++) { + struct cbq_class *next; + + for (cl = q->classes[h]; cl; cl = next) { + next = cl->next; + if (cl != &q->link) + cbq_destroy_class(cl); + } + } + + qdisc_put_rtab(q->link.R_tab); + MOD_DEC_USE_COUNT; +} + +static void cbq_put(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) { +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + + spin_lock_bh(&sch->dev->queue_lock); + if (q->rx_class == cl) + q->rx_class = NULL; + spin_unlock_bh(&sch->dev->queue_lock); +#endif + + cbq_destroy_class(cl); + } +} + +static int +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + 
struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || + rtattr_parse(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt))) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + } + + /* Change class parameters */ + sch_tree_lock(sch); + + if (cl->next_alive != NULL) + cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if 
(tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) { + qdisc_kill_estimator(&cl->stats); + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); + } +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; + } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + cl->stats.lock = &sch->dev->queue_lock; + + sch_tree_lock(sch); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cbq_adjust_levels(parent); + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if 
(cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + qdisc_new_estimator(&cl->stats, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; +} + +static int cbq_delete(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + sch_tree_lock(sch); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_borrowed == cl) + q->tx_borrowed = q->tx_class; + if (q->tx_class == cl) { + q->tx_class = NULL; + q->tx_borrowed = NULL; + } +#ifdef CONFIG_NET_CLS_POLICE + if (q->rx_class == cl) + q->rx_class = NULL; +#endif + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + cbq_rmprio(q, cl); + sch_tree_unlock(sch); + + if (--cl->refcnt == 0) + cbq_destroy_class(cl); + + return 0; +} + +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + if (p && 
p->level <= cl->level) + return 0; + cl->filters++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = (struct cbq_sched_data *)sch->data; + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops cbq_class_ops = +{ + cbq_graft, + cbq_leaf, + cbq_get, + cbq_put, + cbq_change_class, + cbq_delete, + cbq_walk, + + cbq_find_tcf, + cbq_bind_filter, + cbq_unbind_filter, + + cbq_dump_class, +}; + +struct Qdisc_ops cbq_qdisc_ops = +{ + NULL, + &cbq_class_ops, + "cbq", + sizeof(struct cbq_sched_data), + + cbq_enqueue, + cbq_dequeue, + cbq_requeue, + cbq_drop, + + cbq_init, + cbq_reset, + cbq_destroy, + NULL /* cbq_change */, + + cbq_dump, +}; + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&cbq_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&cbq_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_csz.c b/release/src/linux/linux/net/sched/sch_csz.c new file mode 100644 index 00000000..1d1b2397 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_csz.c @@ -0,0 +1,1041 @@ +/* + * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Clark-Shenker-Zhang algorithm. + ======================================= + + SOURCE. + + David D. Clark, Scott Shenker and Lixia Zhang + "Supporting Real-Time Applications in an Integrated Services Packet + Network: Architecture and Mechanism". + + CBQ presents a flexible universal algorithm for packet scheduling, + but it has pretty poor delay characteristics. + Round-robin scheduling and link-sharing goals + apparently contradict minimization of network delay and jitter. + Moreover, correct handling of predictive flows seems to be + impossible in CBQ. + + CSZ presents a more precise but less flexible and less efficient + approach. As I understand it, the main idea is to create + WFQ flows for each guaranteed service and to allocate + the rest of bandwith to dummy flow-0. Flow-0 comprises + the predictive services and the best effort traffic; + it is handled by a priority scheduler with the highest + priority band allocated for predictive services, and the rest --- + to the best effort packets. + + Note that in CSZ flows are NOT limited to their bandwidth. It + is supposed that the flow passed admission control at the edge + of the QoS network and it doesn't need further shaping. 
Any + attempt to improve the flow or to shape it to a token bucket + at intermediate hops will introduce undesired delays and raise + jitter. + + At the moment CSZ is the only scheduler that provides + true guaranteed service. Other schemes (including CBQ) + do not provide guaranteed delay and randomize jitter. + There is a proof (Sally Floyd), that delay + can be estimated by an IntServ-compliant formula. + This result is true formally, but it is wrong in principle. + It takes into account only round-robin delays, + ignoring delays introduced by link sharing, i.e. overlimiting. + Note that temporary overlimits are inevitable because + real links are not ideal, and the real algorithm must take this + into account. + + ALGORITHM. + + --- Notations. + + $B$ is the link bandwidth (bits/sec). + + $I$ is the set of all flows, including flow $0$. + Every flow $a \in I$ has an associated bandwidth slice $r_a < 1$ and + $\sum_{a \in I} r_a = 1$. + + $A \subseteq I$ is the set of currently active flows. + + --- Flow model. + + Let $m_a$ be the number of backlogged bits in flow $a$. + The flow is {\em active} if $m_a > 0$. + This number is a discontinuous function of time; + when a packet $i$ arrives: + \[ + m_a(t_i+0) - m_a(t_i-0) = L^i, + \] + where $L^i$ is the length of the arrived packet. + The flow queue is drained continuously until $m_a == 0$: + \[ + {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}. + \] + I.e. flow rates are their allocated rates proportionally + scaled to take all available link bandwidth. Apparently, + it is not the only possible policy. E.g. CBQ classes + without borrowing would be modelled by: + \[ + {d m_a \over dt} = - B r_a . + \] + More complicated hierarchical bandwidth allocation + policies are possible, but unfortunately, the basic + flow equations have a simple solution only for proportional + scaling. + + --- Departure times.
+ + We calculate the time until the last bit of the packet is sent: + \[ + E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a }, + \] + where $\delta_a(t)$ is the number of bits drained since $t_i$. + We have to evaluate $E_a^i$ for all queued packets, + then find the packet with minimal $E_a^i$ and send it. + + This sounds good, but direct implementation of the algorithm + is absolutely infeasible. Luckily, if flow rates + are scaled proportionally, the equations have a simple solution. + + The differential equation for $E_a^i$ is + \[ + {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} + = - { B \over \sum_{b \in A} r_b} + \] + with initial condition + \[ + E_a^i (t_i) = { m_a(t_i) \over r_a } . + \] + + Let's introduce an auxiliary function $R(t)$: + + --- Round number. + + Consider the following model: we rotate over active flows, + sending $r_a B$ bits from every flow, so that we send + $B \sum_{a \in A} r_a$ bits per round, which takes + $\sum_{a \in A} r_a$ seconds. + + Hence, $R(t)$ (round number) is a monotonically increasing + linear function of time while $A$ does not change + \[ + { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a } + \] + and it is continuous when $A$ changes. + + The central observation is that the quantity + $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all! + $R(t)$ does not depend on flow, so that $F_a^i$ can be + calculated only once on packet arrival, and we need not + recalculate the $E$ numbers or re-sort the queues. + The number $F_a^i$ is called the finish number of the packet. + It is just the value of $R(t)$ when the last bit of the packet + is sent out. + + The maximal finish number on a flow is called the finish number of the flow + and the minimal one is the "start number of the flow". + Apparently, a flow is active if and only if $F_a \geq R$.
+ + When a packet of length $L^i$ bits arrives at flow $a$ at time $t_i$, + we calculate $F_a^i$ as: + + If flow was inactive ($F_a < R$): + $F_a^i = R(t) + {L^i \over B r_a}$ + otherwise + $F_a^i = F_a + {L^i \over B r_a}$ + + These equations complete the algorithm specification. + + It looks pretty hairy, but there is a simple + procedure for solving these equations. + See procedure csz_update(), which is a generalization of + the algorithm from S. Keshav's thesis Chapter 3 + "Efficient Implementation of Fair Queueing". + + NOTES. + + * We implement only the simplest variant of CSZ, + where flow-0 is an explicit 4-band priority fifo. + This is bad, but we need a "peek" operation in addition + to "dequeue" to implement complete CSZ. + I do not want to do that, unless it is absolutely + necessary. + + * Primitive support for token bucket filtering + is present too. It directly contradicts CSZ, but + even though the Internet is on the globe ... :-) + "the edges of the network" really exist. + + BUGS. + + * Fixed point arithmetic is overcomplicated, suboptimal and even + wrong. Check it later.
*/ + + +/* This number is arbitrary */ + +#define CSZ_GUARANTEED 16 /* size of csz_sched_data.flow[] */ +#define CSZ_FLOWS (CSZ_GUARANTEED+4) /* flow[] entries plus the 4 queues in csz_sched_data.other[] */ + /* Bare list linkage: s* pointers chain the list sorted by "start", f* pointers the list sorted by "finish". Used for the two list heads in csz_sched_data; flows are cast to this type when linked. */ +struct csz_head +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; +}; + /* Per-flow state. The first four members repeat struct csz_head in the same order because csz_insert_start()/csz_insert_finish() cast between the two types. */ +struct csz_flow +{ + struct csz_head *snext; + struct csz_head *sprev; + struct csz_head *fnext; + struct csz_head *fprev; + +/* Parameters */ + struct tc_ratespec rate; + struct tc_ratespec slice; + u32 *L_tab; /* Lookup table for L/(B*r_a) values; NULL means the flow is not configured (csz_enqueue drops, csz_get returns 0) */ + unsigned long limit; /* Maximal length of queue */ +#ifdef CSZ_PLUS_TBF + struct tc_ratespec peakrate; + __u32 buffer; /* Depth of token bucket, normalized + as L/(B*r_a) */ + __u32 mtu; +#endif + +/* Variables */ +#ifdef CSZ_PLUS_TBF + unsigned long tokens; /* Tokens number: usecs */ + psched_time_t t_tbf; + unsigned long R_tbf; + int throttled; +#endif + unsigned peeked; /* L2R() value of the packet last peeked by csz_dequeue() for flow[0]; already added to start */ + unsigned long start; /* Finish number of the first skb */ + unsigned long finish; /* Finish number of the flow */ + + struct sk_buff_head q; /* FIFO queue */ +}; + /* Table lookup for a packet of length L in flow f: index is L >> f->slice.cell_log */ +#define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log]) + /* Private data of one CSZ qdisc instance (sch->data) */ +struct csz_sched_data +{ +/* Parameters */ + unsigned char rate_log; /* fixed point position for rate; + * not really needed */ + unsigned char R_log; /* fixed point position for round number */ + unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs; + * 21 <-> 2.1sec is MAXIMAL value */ + +/* Variables */ + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; /* copied from qopt->priomap in csz_init() */ +#ifdef CSZ_PLUS_TBF + struct timer_list wd_timer; + long wd_expires; +#endif + psched_time_t t_c; /* Time check-point */ + unsigned long R_c; /* R-number check-point */ + unsigned long rate; /* Current sum of rates of active flows */ + struct csz_head s; /* Flows sorted by "start" */ + struct csz_head f; /* Flows sorted by "finish" */ + + struct sk_buff_head other[4];/* Predicted (0) and the best efforts + classes (1,2,3) */ + struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */ +}; + +/* These
routines (csz_insert_finish and csz_insert_start) are + the most time consuming part of all the algorithm. + + We insert to sorted list, so that time + is linear with respect to number of active flows in the worst case. + Note that we have not very large number of guaranteed flows, + so that logarithmic algorithms (heap etc.) are useless, + they are slower than linear one when length of list <= 32. + + Heap would take sence if we used WFQ for best efforts + flows, but SFQ is better choice in this case. + */ + + +/* Insert flow "this" to the list "b" before + flow with greater finish number. + */ + +/* Scan backward */ +extern __inline__ void csz_insert_finish(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->fprev; + unsigned long finish = this->finish; + + while (f != b) { + if (((struct csz_flow*)f)->finish - finish <= 0) + break; + f = f->fprev; + } + this->fnext = f->fnext; + this->fprev = f; + this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this; +} + +/* Insert flow "this" to the list "b" before + flow with greater start number. + */ + +extern __inline__ void csz_insert_start(struct csz_head *b, + struct csz_flow *this) +{ + struct csz_head *f = b->snext; + unsigned long start = this->start; + + while (f != b) { + if (((struct csz_flow*)f)->start - start > 0) + break; + f = f->snext; + } + this->snext = f; + this->sprev = f->sprev; + this->snext->sprev = this->sprev->snext = (struct csz_head*)this; +} + + +/* Calculate and return current round number. + It is another time consuming part, but + it is impossible to avoid it. + + It costs O(N) that make all the algorithm useful only + to play with closest to ideal fluid model. 
+ + There exist less academic, but more practical modifications, + which might have even better characteristics (WF2Q+, HPFQ, HFSC) + */ + +static unsigned long csz_update(struct Qdisc *sch) +{ + struct csz_sched_data *q = (struct csz_sched_data*)sch->data; + struct csz_flow *a; + unsigned long F; + unsigned long tmp; + psched_time_t now; + unsigned long delay; + unsigned long R_c; + + PSCHED_GET_TIME(now); + delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset); + + if (delay>>q->delta_log) { +do_reset: + /* Delta is too large. + It is possible if MTU/BW > 1<<q->delta_log + (i.e. configuration error) or because of hardware + fault. We have no choice... + */ + qdisc_reset(sch); + return 0; + } + + q->t_c = now; + + for (;;) { + a = (struct csz_flow*)q->f.fnext; + + /* No more active flows. Reset R and exit. */ + if (a == (struct csz_flow*)&q->f) { +#ifdef CSZ_DEBUG + if (q->rate) { + printk("csz_update: rate!=0 on inactive csz\n"); + q->rate = 0; + } +#endif + q->R_c = 0; + return 0; + } + + F = a->finish; + +#ifdef CSZ_DEBUG + if (q->rate == 0) { + printk("csz_update: rate=0 on active csz\n"); + goto do_reset; + } +#endif + + /* + * tmp = (t - q->t_c)/q->rate; + */ + + tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log); + + tmp += q->R_c; + + /* OK, this flow (and all flows with greater + finish numbers) is still active */ + if (F - tmp > 0) + break; + + /* It is more not active */ + + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + + /* + * q->t_c += (F - q->R_c)*q->rate + */ + + tmp = ((F-q->R_c)*q->rate)<<q->R_log; + R_c = F; + q->rate -= a->slice.rate; + + if ((long)(delay - tmp) >= 0) { + delay -= tmp; + continue; + } + delay = 0; + } + + q->R_c = tmp; + return tmp; +} + +unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q) +{ + return CSZ_GUARANTEED; +} + +static int +csz_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned flow_id = 
csz_classify(skb, q); + unsigned long R; + int prio = 0; + struct csz_flow *this; + + if (flow_id >= CSZ_GUARANTEED) { + prio = flow_id - CSZ_GUARANTEED; + flow_id = 0; + } + + this = &q->flow[flow_id]; + if (this->q.qlen >= this->limit || this->L_tab == NULL) { + sch->stats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; + } + + R = csz_update(sch); + + if ((long)(this->finish - R) >= 0) { + /* It was active */ + this->finish += L2R(this,skb->len); + } else { + /* It is inactive; activate it */ + this->finish = R + L2R(this,skb->len); + q->rate += this->slice.rate; + csz_insert_finish(&q->f, this); + } + + /* If this flow was empty, remember start number + and insert it into start queue */ + if (this->q.qlen == 0) { + this->start = this->finish; + csz_insert_start(&q->s, this); + } + if (flow_id) + skb_queue_tail(&this->q, skb); + else + skb_queue_tail(&q->other[prio], skb); + sch->q.qlen++; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; +} + +static __inline__ struct sk_buff * +skb_dequeue_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_dequeue(&q->other[i]); + if (skb) { + q->flow[0].q.qlen--; + return skb; + } + } + return NULL; +} + +static __inline__ struct sk_buff * +skb_peek_best(struct csz_sched_data * q) +{ + int i; + struct sk_buff *skb; + + for (i=0; i<4; i++) { + skb = skb_peek(&q->other[i]); + if (skb) + return skb; + } + return NULL; +} + +#ifdef CSZ_PLUS_TBF + +static void csz_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + qdisc_wakeup(sch->dev); +} + +static __inline__ void +csz_move_queue(struct csz_flow *this, long delta) +{ + this->fprev->fnext = this->fnext; + this->fnext->fprev = this->fprev; + + this->start += delta; + this->finish += delta; + + csz_insert_finish(this); +} + +static __inline__ int csz_enough_tokens(struct csz_sched_data *q, + struct csz_flow *this, + struct sk_buff *skb) +{ + long toks; + long shift; + psched_time_t 
now; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len); + + shift = 0; + if (this->throttled) { + /* Remember aposteriory delay */ + + unsigned long R = csz_update(q); + shift = R - this->R_tbf; + this->R_tbf = R; + } + + if (toks >= 0) { + /* Now we have enough tokens to proceed */ + + this->tokens = toks <= this->depth ? toks : this->depth; + this->t_tbf = now; + + if (!this->throttled) + return 1; + + /* Flow was throttled. Update its start&finish numbers + with delay calculated aposteriori. + */ + + this->throttled = 0; + if (shift > 0) + csz_move_queue(this, shift); + return 1; + } + + if (!this->throttled) { + /* Flow has just been throttled; remember + current round number to calculate aposteriori delay + */ + this->throttled = 1; + this->R_tbf = csz_update(q); + } + + /* Move all the queue to the time when it will be allowed to send. + We should translate time to round number, but it is impossible, + so that we made the most conservative estimate i.e. we suppose + that only this flow is active and, hence, R = t. + Really toks <= R <= toks/r_a. + + This apriory shift in R will be adjusted later to reflect + real delay. We cannot avoid it because of: + - throttled flow continues to be active from the viewpoint + of CSZ, so that it would acquire the highest priority, + if you not adjusted start numbers. + - Eventually, finish number would become less than round + number and flow were declared inactive. 
+ */ + + toks = -toks; + + /* Remeber, that we should start watchdog */ + if (toks < q->wd_expires) + q->wd_expires = toks; + + toks >>= q->R_log; + shift += toks; + if (shift > 0) { + this->R_tbf += toks; + csz_move_queue(this, shift); + } + csz_insert_start(this); + return 0; +} +#endif + + +static struct sk_buff * +csz_dequeue(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct sk_buff *skb; + struct csz_flow *this; + +#ifdef CSZ_PLUS_TBF + q->wd_expires = 0; +#endif + this = (struct csz_flow*)q->s.snext; + + while (this != (struct csz_flow*)&q->s) { + + /* First of all: unlink from start list */ + this->sprev->snext = this->snext; + this->snext->sprev = this->sprev; + + if (this != &q->flow[0]) { /* Guaranteed flow */ + skb = __skb_dequeue(&this->q); + if (skb) { +#ifdef CSZ_PLUS_TBF + if (this->depth) { + if (!csz_enough_tokens(q, this, skb)) + continue; + } +#endif + if (this->q.qlen) { + struct sk_buff *nskb = skb_peek(&this->q); + this->start += L2R(this,nskb->len); + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } else { /* Predicted or best effort flow */ + skb = skb_dequeue_best(q); + if (skb) { + unsigned peeked = this->peeked; + this->peeked = 0; + + if (--this->q.qlen) { + struct sk_buff *nskb; + unsigned dequeued = L2R(this,skb->len); + + /* We got not the same thing that + peeked earlier; adjust start number + */ + if (peeked != dequeued && peeked) + this->start += dequeued - peeked; + + nskb = skb_peek_best(q); + peeked = L2R(this,nskb->len); + this->start += peeked; + this->peeked = peeked; + csz_insert_start(&q->s, this); + } + sch->q.qlen--; + return skb; + } + } + } +#ifdef CSZ_PLUS_TBF + /* We are about to return no skb. + Schedule watchdog timer, if it occurred because of shaping. 
+ */ + if (q->wd_expires) { + unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires); + del_timer(&q->wd_timer); + if (delay == 0) + delay = 1; + q->wd_timer.expires = jiffies + delay; + add_timer(&q->wd_timer); + sch->stats.overlimits++; + } +#endif + return NULL; +} + +static void +csz_reset(struct Qdisc* sch) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int i; + + for (i=0; i<4; i++) + skb_queue_purge(&q->other[i]); + + for (i=0; i<CSZ_GUARANTEED; i++) { + struct csz_flow *this = q->flow + i; + skb_queue_purge(&this->q); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + PSCHED_GET_TIME(&q->t_tbf); + q->tokens = q->depth; + del_timer(&q->wd_timer); +#endif + sch->q.qlen = 0; +} + +static void +csz_destroy(struct Qdisc* sch) +{ + MOD_DEC_USE_COUNT; +} + +static int csz_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_qopt *qopt; + int i; + + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + q->R_log = qopt->R_log; + q->delta_log = qopt->delta_log; + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= CSZ_FLOWS) + return -EINVAL; + q->prio2band[i] = qopt->priomap[i]; + } + + for (i=0; i<4; i++) + skb_queue_head_init(&q->other[i]); + + for (i=0; i<CSZ_GUARANTEED; i++) { + struct csz_flow *this = q->flow + i; + skb_queue_head_init(&this->q); + this->snext = this->sprev = + this->fnext = this->fprev = (struct csz_head*)this; + this->start = this->finish = 0; + } + q->s.snext = q->s.sprev = &q->s; + q->f.fnext = q->f.fprev = &q->f; + q->R_c = 0; +#ifdef CSZ_PLUS_TBF + 
init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = csz_watchdog; +#endif + MOD_INC_USE_COUNT; + return 0; +} + +static int csz_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.flows = CSZ_FLOWS; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + return -EINVAL; +} + +static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + + +static unsigned long csz_get(struct Qdisc *sch, u32 classid) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid) - 1; + + if (band >= CSZ_FLOWS) + return 0; + + if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL) + return 0; + + return band+1; +} + +static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + return csz_get(sch, classid); +} + + +static void csz_put(struct Qdisc *sch, unsigned long cl) +{ + return; +} + +static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CSZ_PTAB]; + struct tc_csz_copt *copt; + + rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_CSZ_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt)) + return -EINVAL; + copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]); + + if (tb[TCA_CSZ_RTAB-1] && + 
RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024) + return -EINVAL; + + if (cl) { + struct csz_flow *a; + cl--; + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) + return -EINVAL; + + a = &q->flow[cl]; + + spin_lock_bh(&sch->dev->queue_lock); +#ifdef CSZ_PLUS_TBF + a->limit = copt->limit; + a->rate = copt->rate; + a->buffer = copt->buffer; + a->mtu = copt->mtu; +#endif + + if (tb[TCA_CSZ_RTAB-1]) + memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024); + + spin_unlock_bh(&sch->dev->queue_lock); + return 0; + } + /* NI */ + return 0; +} + +static int csz_delete(struct Qdisc *sch, unsigned long cl) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + struct csz_flow *a; + + cl--; + + if (cl >= CSZ_FLOWS) + return -ENOENT; + if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL) + return -EINVAL; + + a = &q->flow[cl]; + + spin_lock_bh(&sch->dev->queue_lock); + a->fprev->fnext = a->fnext; + a->fnext->fprev = a->fprev; + a->sprev->snext = a->snext; + a->snext->sprev = a->sprev; + a->start = a->finish = 0; + kfree(xchg(&q->flow[cl].L_tab, NULL)); + spin_unlock_bh(&sch->dev->queue_lock); + + return 0; +} + +static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_csz_copt opt; + + tcm->tcm_handle = sch->handle|cl; + + cl--; + + if (cl > CSZ_FLOWS) + goto rtattr_failure; + + if (cl < CSZ_GUARANTEED) { + struct csz_flow *f = &q->flow[cl]; + + if (f->L_tab == NULL) + goto rtattr_failure; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = f->limit; + opt.rate = f->rate; + opt.slice = f->slice; + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); +#ifdef CSZ_PLUS_TBF + opt.buffer = f->buffer; + opt.mtu = f->mtu; +#else + opt.buffer = 0; + opt.mtu = 0; +#endif + + RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt); + 
rta->rta_len = skb->tail - b; + } + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + int prio = 0; + + if (arg->stop) + return; + + for (prio = 0; prio < CSZ_FLOWS; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct csz_sched_data *q = (struct csz_sched_data *)sch->data; + + if (cl) + return NULL; + + return &q->filter_list; +} + +struct Qdisc_class_ops csz_class_ops = +{ + csz_graft, + csz_leaf, + + csz_get, + csz_put, + csz_change, + csz_delete, + csz_walk, + + csz_find_tcf, + csz_bind, + csz_put, + + csz_dump_class, +}; + +struct Qdisc_ops csz_qdisc_ops = +{ + NULL, + &csz_class_ops, + "csz", + sizeof(struct csz_sched_data), + + csz_enqueue, + csz_dequeue, + NULL, + NULL, + + csz_init, + csz_reset, + csz_destroy, + NULL /* csz_change */, + + csz_dump, +}; + + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&csz_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&csz_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_dsmark.c b/release/src/linux/linux/net/sched/sch_dsmark.c new file mode 100644 index 00000000..e8726612 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_dsmark.c @@ -0,0 +1,472 @@ +/* net/sched/sch_dsmark.c - Differentiated Services field marker */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> 
/* for pkt_sched */ +#include <linux/rtnetlink.h> +#include <net/pkt_sched.h> +#include <net/dsfield.h> +#include <asm/byteorder.h> + + +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) + +#define D2PRINTK(format,args...) + + +#define PRIV(sch) ((struct dsmark_qdisc_data *) (sch)->data) + + +/* + * classid class marking + * ------- ----- ------- + * n/a 0 n/a + * x:0 1 use entry [0] + * ... ... ... + * x:y y>0 y+1 use entry [y] + * ... ... ... + * x:indices-1 indices use entry [indices-1] + * ... ... ... + * x:y y+1 use entry [y & (indices-1)] + * ... ... ... + * 0xffff 0x10000 use entry [indices-1] + */ + + +#define NO_DEFAULT_INDEX (1 << 16) + +struct dsmark_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; + __u8 *mask; /* "owns" the array */ + __u8 *value; + __u16 indices; + __u32 default_index; /* index range is 0...0xffff */ + int set_tc_index; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int dsmark_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, + old); + if (!new) + new = &noop_qdisc; + sch_tree_lock(sch); + *old = xchg(&p->q,new); + if (*old) + qdisc_reset(*old); + sch_tree_unlock(sch); /* @@@ move up ? 
*/ + return 0; +} + + +static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return p->q; +} + + +static unsigned long dsmark_get(struct Qdisc *sch,u32 classid) +{ + struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); + + DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + return TC_H_MIN(classid)+1; +} + + +static unsigned long dsmark_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return dsmark_get(sch,classid); +} + + +static void dsmark_put(struct Qdisc *sch, unsigned long cl) +{ +} + + +static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_DSMARK_MAX]; + + DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n",sch,p,classid,parent,*arg); + if (*arg > p->indices) + return -ENOENT; + if (!opt || rtattr_parse(tb, TCA_DSMARK_MAX, RTA_DATA(opt), + RTA_PAYLOAD(opt))) + return -EINVAL; + if (tb[TCA_DSMARK_MASK-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1])) + return -EINVAL; + p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]); + } + if (tb[TCA_DSMARK_VALUE-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1])) + return -EINVAL; + p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]); + } + return 0; +} + + +static int dsmark_delete(struct Qdisc *sch,unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + if (!arg || arg > p->indices) + return -EINVAL; + p->mask[arg-1] = 0xff; + p->value[arg-1] = 0; + return 0; +} + + +static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + int i; + + DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) + return; + for (i = 0; i < p->indices; i++) { + if (p->mask[i] == 0xff && !p->value[i]) + 
continue; + if (walker->count >= walker->skip) { + if (walker->fn(sch, i+1, walker) < 0) { + walker->stop = 1; + break; + } + } + walker->count++; + } +} + + +static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return &p->filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if (p->set_tc_index) { + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + skb->tc_index = ipv4_get_dsfield(skb->nh.iph); + break; + case __constant_htons(ETH_P_IPV6): + skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h); + break; + default: + skb->tc_index = 0; + break; + }; + } + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) == sch->handle) { + skb->tc_index = TC_H_MIN(skb->priority); + } else { + result = tc_classify(skb,p->filter_list,&res); + D2PRINTK("result %d class 0x%04x\n",result,res.classid); + switch (result) { +#ifdef CONFIG_NET_CLS_POLICE + case TC_POLICE_SHOT: + kfree_skb(skb); + break; +#endif + case TC_POLICE_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + case TC_POLICE_UNSPEC: + /* fall through */ + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + }; + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + + ((ret = p->q->enqueue(skb,p->q)) != 0)) { + sch->stats.drops++; + return ret; + } + sch->stats.bytes += skb->len; + sch->stats.packets++; + sch->q.qlen++; + return ret; +} + + +static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + int index; + + D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p); + skb = 
p->q->ops->dequeue(p->q); + if (!skb) + return NULL; + sch->q.qlen--; + index = skb->tc_index & (p->indices-1); + D2PRINTK("index %d->%d\n",skb->tc_index,index); + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ipv4_change_dsfield(skb->nh.iph, + p->mask[index],p->value[index]); + break; + case __constant_htons(ETH_P_IPV6): + ipv6_change_dsfield(skb->nh.ipv6h, + p->mask[index],p->value[index]); + break; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. + */ + if (p->mask[index] != 0xff || p->value[index]) + printk(KERN_WARNING "dsmark_dequeue: " + "unsupported protocol %d\n", + htons(skb->protocol)); + break; + }; + return skb; +} + + +static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + int ret; + struct dsmark_qdisc_data *p = PRIV(sch); + + D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { + sch->q.qlen++; + return 0; + } + sch->stats.drops++; + return ret; +} + + +static int dsmark_drop(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + if (!p->q->ops->drop) + return 0; + if (!p->q->ops->drop(p->q)) + return 0; + sch->q.qlen--; + return 1; +} + + +int dsmark_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *tb[TCA_DSMARK_MAX]; + __u16 tmp; + + DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + if (rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || + !tb[TCA_DSMARK_INDICES-1] || + RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) + return -EINVAL; + memset(p,0,sizeof(*p)); + p->filter_list = NULL; + p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); + if (!p->indices) + return -EINVAL; + for (tmp = p->indices; tmp != 1; tmp >>= 1) { + if (tmp & 1) + return -EINVAL; + } + 
p->default_index = NO_DEFAULT_INDEX; + if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { + if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) + return -EINVAL; + p->default_index = + *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + } + p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; + p->mask = kmalloc(p->indices*2,GFP_KERNEL); + if (!p->mask) + return -ENOMEM; + p->value = p->mask+p->indices; + memset(p->mask,0xff,p->indices); + memset(p->value,0,p->indices); + if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + p->q = &noop_qdisc; + DPRINTK("dsmark_init: qdisc %p\n",&p->q); + MOD_INC_USE_COUNT; + return 0; +} + + +static void dsmark_reset(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + qdisc_reset(p->q); + sch->q.qlen = 0; +} + + +static void dsmark_destroy(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_proto *tp; + + DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tp->ops->destroy(tp); + } + qdisc_destroy(p->q); + p->q = &noop_qdisc; + kfree(p->mask); + MOD_DEC_USE_COUNT; +} + + +static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); + if (!cl || cl > p->indices) + return -EINVAL; + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); + RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + 
unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); + if (p->default_index != NO_DEFAULT_INDEX) { + __u16 tmp = p->default_index; + + RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp); + } + if (p->set_tc_index) + RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static struct Qdisc_class_ops dsmark_class_ops = +{ + dsmark_graft, /* graft */ + dsmark_leaf, /* leaf */ + dsmark_get, /* get */ + dsmark_put, /* put */ + dsmark_change, /* change */ + dsmark_delete, /* delete */ + dsmark_walk, /* walk */ + + dsmark_find_tcf, /* tcf_chain */ + dsmark_bind_filter, /* bind_tcf */ + dsmark_put, /* unbind_tcf */ + + dsmark_dump_class, /* dump */ +}; + +struct Qdisc_ops dsmark_qdisc_ops = +{ + NULL, /* next */ + &dsmark_class_ops, /* cl_ops */ + "dsmark", + sizeof(struct dsmark_qdisc_data), + + dsmark_enqueue, /* enqueue */ + dsmark_dequeue, /* dequeue */ + dsmark_requeue, /* requeue */ + dsmark_drop, /* drop */ + + dsmark_init, /* init */ + dsmark_reset, /* reset */ + dsmark_destroy, /* destroy */ + NULL, /* change */ + + dsmark_dump /* dump */ +}; + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&dsmark_qdisc_ops); +} + + +void cleanup_module(void) +{ + unregister_qdisc(&dsmark_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_fifo.c b/release/src/linux/linux/net/sched/sch_fifo.c new file mode 100644 index 00000000..d8ce46f2 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_fifo.c @@ -0,0 +1,208 @@ +/* + * net/sched/sch_fifo.c The simplest FIFO queue. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* 1 band FIFO pseudo-"scheduler" */ + +struct fifo_sched_data +{ + unsigned limit; +}; + +static int +bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 0; +} + +static struct sk_buff * +bfifo_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + if (skb) + sch->stats.backlog -= skb->len; + return skb; +} + +static int +fifo_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + 
sch->stats.backlog -= skb->len; + kfree_skb(skb); + return 1; + } + return 0; +} + +static void +fifo_reset(struct Qdisc* sch) +{ + skb_queue_purge(&sch->q); + sch->stats.backlog = 0; +} + +static int +pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = (struct fifo_sched_data *)sch->data; + + if (sch->q.qlen <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + sch->stats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + return 0; +} + + +static struct sk_buff * +pfifo_dequeue(struct Qdisc* sch) +{ + return __skb_dequeue(&sch->q); +} + +static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = (void*)sch->data; + + if (opt == NULL) { + if (sch->ops == &bfifo_qdisc_ops) + q->limit = sch->dev->tx_queue_len*sch->dev->mtu; + else + q->limit = sch->dev->tx_queue_len; + } else { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->limit = ctl->limit; + } + return 0; +} + +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = (void*)sch->data; + unsigned char *b = skb->tail; + struct tc_fifo_qopt opt; + + opt.limit = q->limit; + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct Qdisc_ops pfifo_qdisc_ops = +{ + NULL, + NULL, + "pfifo", + sizeof(struct fifo_sched_data), + + pfifo_enqueue, + pfifo_dequeue, + pfifo_requeue, + fifo_drop, + + fifo_init, + fifo_reset, + NULL, + fifo_init, + + fifo_dump, +}; + +struct Qdisc_ops bfifo_qdisc_ops = +{ + NULL, + NULL, + "bfifo", + sizeof(struct fifo_sched_data), + + bfifo_enqueue, + bfifo_dequeue, 
+ bfifo_requeue, + fifo_drop, + + fifo_init, + fifo_reset, + NULL, + fifo_init, + fifo_dump, +}; diff --git a/release/src/linux/linux/net/sched/sch_generic.c b/release/src/linux/linux/net/sched/sch_generic.c new file mode 100644 index 00000000..7b0d49e7 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_generic.c @@ -0,0 +1,518 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Jamal Hadi Salim, <hadi@nortelnetworks.com> 990601 + * - Ingress support + */ + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/config.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* Main transmission queue. */ + +/* Main qdisc structure lock. + + However, modifications + to data, participating in scheduling must be additionally + protected with dev->queue_lock spinlock. + + The idea is the following: + - enqueue, dequeue are serialized via top level device + spinlock dev->queue_lock. + - tree walking is protected by read_lock(qdisc_tree_lock) + and this lock is used only in process context. + - updates to tree are made only under rtnl semaphore, + hence this lock may be made without local bh disabling. + + qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! 
+ */ +rwlock_t qdisc_tree_lock = RW_LOCK_UNLOCKED; + +/* + dev->queue_lock serializes queue accesses for this device + AND dev->qdisc pointer itself. + + dev->xmit_lock serializes accesses to device driver. + + dev->queue_lock and dev->xmit_lock are mutually exclusive, + if one is grabbed, another must be free. + */ + + +/* Kick device. + Note, that this procedure can be called by a watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called under dev->queue_lock with locally disabled BH. +*/ + +int qdisc_restart(struct net_device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + + /* Dequeue packet */ + if ((skb = q->dequeue(q)) != NULL) { + if (spin_trylock(&dev->xmit_lock)) { + /* Remember that the driver is grabbed by us. */ + dev->xmit_lock_owner = smp_processor_id(); + + /* And release queue */ + spin_unlock(&dev->queue_lock); + + if (!netif_queue_stopped(dev)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + if (dev->hard_start_xmit(skb, dev) == 0) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + + spin_lock(&dev->queue_lock); + return -1; + } + } + + /* Release the driver */ + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + spin_lock(&dev->queue_lock); + q = dev->qdisc; + } else { + /* So, someone grabbed the driver. */ + + /* It may be transient configuration error, + when hard_start_xmit() recurses. We detect + it by checking xmit owner and drop the + packet when deadloop is detected. + */ + if (dev->xmit_lock_owner == smp_processor_id()) { + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + return -1; + } + netdev_rx_stat[smp_processor_id()].cpu_collision++; + } + + /* Device kicked us out :( + This is possible in three cases: + + 0. driver is locked + 1. 
fastroute is enabled + 2. device cannot determine busy state + before start of transmission (f.e. dialout) + 3. device is buggy (ppp) + */ + + q->ops->requeue(skb, q); + netif_schedule(dev); + return 1; + } + return q->q.qlen; +} + +static void dev_watchdog(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + + spin_lock(&dev->xmit_lock); + if (dev->qdisc != &noop_qdisc) { + if (netif_device_present(dev) && + netif_running(dev) && + netif_carrier_ok(dev)) { + if (netif_queue_stopped(dev) && + (jiffies - dev->trans_start) > dev->watchdog_timeo) { + printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); + dev->tx_timeout(dev); + } + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } + } + spin_unlock(&dev->xmit_lock); + + dev_put(dev); +} + +static void dev_watchdog_init(struct net_device *dev) +{ + init_timer(&dev->watchdog_timer); + dev->watchdog_timer.data = (unsigned long)dev; + dev->watchdog_timer.function = dev_watchdog; +} + +void __netdev_watchdog_up(struct net_device *dev) +{ + if (dev->tx_timeout) { + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } +} + +static void dev_watchdog_up(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + __netdev_watchdog_up(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +static void dev_watchdog_down(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + if (del_timer(&dev->watchdog_timer)) + __dev_put(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + under all circumstances. It is difficult to invent anything faster or + cheaper. 
+ */ + +static int +noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb); + return NET_XMIT_CN; +} + +static struct sk_buff * +noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +static int +noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return NET_XMIT_CN; +} + +struct Qdisc_ops noop_qdisc_ops = +{ + NULL, + NULL, + "noop", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, +}; + +struct Qdisc noop_qdisc = +{ + noop_enqueue, + noop_dequeue, + TCQ_F_BUILTIN, + &noop_qdisc_ops, +}; + + +struct Qdisc_ops noqueue_qdisc_ops = +{ + NULL, + NULL, + "noqueue", + 0, + + noop_enqueue, + noop_dequeue, + noop_requeue, + +}; + +struct Qdisc noqueue_qdisc = +{ + NULL, + noop_dequeue, + TCQ_F_BUILTIN, + &noqueue_qdisc_ops, +}; + + +static const u8 prio2band[TC_PRIO_MAX+1] = +{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. 
+ */ + +static int +pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; + + if (list->qlen <= qdisc->dev->tx_queue_len) { + __skb_queue_tail(list, skb); + qdisc->q.qlen++; + return 0; + } + qdisc->stats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static struct sk_buff * +pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + struct sk_buff *skb; + + for (prio = 0; prio < 3; prio++, list++) { + skb = __skb_dequeue(list); + if (skb) { + qdisc->q.qlen--; + return skb; + } + } + return NULL; +} + +static int +pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data) + + prio2band[skb->priority&TC_PRIO_MAX]; + + __skb_queue_head(list, skb); + qdisc->q.qlen++; + return 0; +} + +static void +pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = ((struct sk_buff_head*)qdisc->data); + + for (prio=0; prio < 3; prio++) + skb_queue_purge(list+prio); + qdisc->q.qlen = 0; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +{ + int i; + struct sk_buff_head *list; + + list = ((struct sk_buff_head*)qdisc->data); + + for (i=0; i<3; i++) + skb_queue_head_init(list+i); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops = +{ + NULL, + NULL, + "pfifo_fast", + 3 * sizeof(struct sk_buff_head), + + pfifo_fast_enqueue, + pfifo_fast_dequeue, + pfifo_fast_requeue, + NULL, + + pfifo_fast_init, + pfifo_fast_reset, +}; + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + struct Qdisc *sch; + int size = sizeof(*sch) + ops->priv_size; + + sch = kmalloc(size, GFP_KERNEL); + if (!sch) + return NULL; + memset(sch, 0, size); + + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + 
sch->dequeue = ops->dequeue; + sch->dev = dev; + sch->stats.lock = &dev->queue_lock; + atomic_set(&sch->refcnt, 1); + if (!ops->init || ops->init(sch, NULL) == 0) + return sch; + + kfree(sch); + return NULL; +} + +/* Under dev->queue_lock and BH! */ + +void qdisc_reset(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + + if (ops->reset) + ops->reset(qdisc); +} + +/* Under dev->queue_lock and BH! */ + +void qdisc_destroy(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + struct net_device *dev; + + if (!atomic_dec_and_test(&qdisc->refcnt)) + return; + + dev = qdisc->dev; + +#ifdef CONFIG_NET_SCHED + if (dev) { + struct Qdisc *q, **qp; + for (qp = &qdisc->dev->qdisc_list; (q=*qp) != NULL; qp = &q->next) { + if (q == qdisc) { + *qp = q->next; + break; + } + } + } +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&qdisc->stats); +#endif +#endif + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + if (!(qdisc->flags&TCQ_F_BUILTIN)) + kfree(qdisc); +} + + +void dev_activate(struct net_device *dev) +{ + /* No queueing discipline is attached to device; + create default one i.e. 
pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces + */ + + if (dev->qdisc_sleeping == &noop_qdisc) { + struct Qdisc *qdisc; + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); + if (qdisc == NULL) { + printk(KERN_INFO "%s: activation failed\n", dev->name); + return; + } + } else { + qdisc = &noqueue_qdisc; + } + write_lock(&qdisc_tree_lock); + dev->qdisc_sleeping = qdisc; + write_unlock(&qdisc_tree_lock); + } + + spin_lock_bh(&dev->queue_lock); + if ((dev->qdisc = dev->qdisc_sleeping) != &noqueue_qdisc) { + dev->trans_start = jiffies; + dev_watchdog_up(dev); + } + spin_unlock_bh(&dev->queue_lock); +} + +void dev_deactivate(struct net_device *dev) +{ + struct Qdisc *qdisc; + + spin_lock_bh(&dev->queue_lock); + qdisc = dev->qdisc; + dev->qdisc = &noop_qdisc; + + qdisc_reset(qdisc); + + spin_unlock_bh(&dev->queue_lock); + + dev_watchdog_down(dev); + + while (test_bit(__LINK_STATE_SCHED, &dev->state)) + yield(); + + spin_unlock_wait(&dev->xmit_lock); +} + +void dev_init_scheduler(struct net_device *dev) +{ + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); + dev->qdisc = &noop_qdisc; + spin_unlock_bh(&dev->queue_lock); + dev->qdisc_sleeping = &noop_qdisc; + dev->qdisc_list = NULL; + write_unlock(&qdisc_tree_lock); + + dev_watchdog_init(dev); +} + +void dev_shutdown(struct net_device *dev) +{ + struct Qdisc *qdisc; + + write_lock(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); + qdisc = dev->qdisc_sleeping; + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; + qdisc_destroy(qdisc); +#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) + if ((qdisc = dev->qdisc_ingress) != NULL) { + dev->qdisc_ingress = NULL; + qdisc_destroy(qdisc); + } +#endif + BUG_TRAP(dev->qdisc_list == NULL); + BUG_TRAP(!timer_pending(&dev->watchdog_timer)); + dev->qdisc_list = NULL; + spin_unlock_bh(&dev->queue_lock); + write_unlock(&qdisc_tree_lock); +} diff --git 
a/release/src/linux/linux/net/sched/sch_gred.c b/release/src/linux/linux/net/sched/sch_gred.c new file mode 100644 index 00000000..6c664155 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_gred.c @@ -0,0 +1,625 @@ +/* + * net/sched/sch_gred.c Generic Random Early Detection queue. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 + * + * 991129: - Bug fix with grio mode + * - a better sing. AvgQ mode with Grio(WRED) + * - A finer grained VQ dequeue based on sugestion + * from Ren Liu + * - More error checks + * + * + * + * For all the glorious comments look at Alexey's sch_red.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) + +#define D2PRINTK(format,args...) 
+ +struct gred_sched_data; +struct gred_sched; + +struct gred_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 DP; /* the drop pramaters */ + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + u32 Scell_max; + u32 Rmask; + u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 packetsin; /* packets seen on virtualQ so far*/ + u32 backlog; /* bytes on the virtualQ */ + u32 forced; /* packets dropped for exceeding limits */ + u32 early; /* packets dropped as a warning */ + u32 other; /* packets dropped by invoking drop() */ + u32 pdrop; /* packets dropped because we exceeded physical queue limits */ + char Scell_log; + u8 Stab[256]; + u8 prio; /* the prio of this vq */ + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ +}; + +struct gred_sched +{ + struct gred_sched_data *tab[MAX_DPs]; + u32 DPs; + u32 def; + u8 initd; + u8 grio; + u8 eqp; +}; + +static int +gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + psched_time_t now; + struct gred_sched_data *q=NULL; + struct gred_sched *t= (struct gred_sched *)sch->data; + unsigned long qave=0; + int i=0; + + if (!t->initd && skb_queue_len(&sch->q) <= sch->dev->tx_queue_len) { + D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); + goto do_enqueue; + } + + + if ( ((skb->tc_index&0xf) > t->DPs) || !(q=t->tab[skb->tc_index&0xf])) { + printk("GRED: setting to default (%d)\n ",t->def); + if (!(q=t->tab[t->def])) { + DPRINTK("GRED: setting to default FAILED! dropping!! " + "(%d)\n ", t->def); + goto drop; + } + /* fix tc_index? 
--could be controvesial but needed for + requeueing */ + skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; + } + + D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " + "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, + sch->stats.backlog); + /* sum up all the qaves of prios <= to ours to get the new qave*/ + if (!t->eqp && t->grio) { + for (i=0;i<t->DPs;i++) { + if ((!t->tab[i]) || (i==q->DP)) + continue; + + if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) + qave +=t->tab[i]->qave; + } + + } + + q->packetsin++; + q->bytesin+=skb->len; + + if (t->eqp && t->grio) { + qave=0; + q->qave=t->tab[t->def]->qave; + q->qidlestart=t->tab[t->def]->qidlestart; + } + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); + PSCHED_SET_PASTPERFECT(q->qidlestart); + + q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; + } else { + if (t->eqp) { + q->qave += sch->stats.backlog - (q->qave >> q->Wlog); + } else { + q->qave += q->backlog - (q->qave >> q->Wlog); + } + + } + + + if (t->eqp && t->grio) + t->tab[t->def]->qave=q->qave; + + if ((q->qave+qave) < q->qth_min) { + q->qcount = -1; +enqueue: + if (q->backlog <= q->limit) { + q->backlog += skb->len; +do_enqueue: + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } else { + q->pdrop++; + } + +drop: + kfree_skb(skb); + sch->stats.drops++; + return NET_XMIT_DROP; + } + if ((q->qave+qave) >= q->qth_max) { + q->qcount = -1; + sch->stats.overlimits++; + q->forced++; + goto drop; + } + if (++q->qcount) { + if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->stats.overlimits++; + q->early++; + goto drop; + } + q->qR = net_random()&q->Rmask; + goto enqueue; +} + +static int +gred_requeue(struct sk_buff *skb, 
struct Qdisc* sch) +{ + struct gred_sched_data *q; + struct gred_sched *t= (struct gred_sched *)sch->data; + q= t->tab[(skb->tc_index&0xf)]; +/* error checking here -- probably unnecessary */ + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + q->backlog += skb->len; + return 0; +} + +static struct sk_buff * +gred_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct gred_sched_data *q; + struct gred_sched *t= (struct gred_sched *)sch->data; + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= skb->len; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } + return skb; + } + + if (t->eqp) { + q= t->tab[t->def]; + if (!q) + D2PRINTK("no default VQ set: Results will be " + "screwed up\n"); + else + PSCHED_GET_TIME(q->qidlestart); + } + + return NULL; +} + +static int +gred_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + struct gred_sched_data *q; + struct gred_sched *t= (struct gred_sched *)sch->data; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= skb->len; + q->other++; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } + + kfree_skb(skb); + return 1; + } + + q=t->tab[t->def]; + if (!q) { + D2PRINTK("no default VQ set: Results might be screwed up\n"); + return 0; + } + + PSCHED_GET_TIME(q->qidlestart); + return 0; + +} + +static void gred_reset(struct Qdisc* sch) +{ + int i; + struct gred_sched_data *q; + struct gred_sched *t= (struct gred_sched *)sch->data; + + __skb_queue_purge(&sch->q); + + sch->stats.backlog = 0; + + for (i=0;i<t->DPs;i++) { + q= t->tab[i]; + if (!q) + continue; + 
PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; + q->backlog = 0; + q->other=0; + q->forced=0; + q->pdrop=0; + q->early=0; + } +} + +static int gred_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct gred_sched *table = (struct gred_sched *)sch->data; + struct gred_sched_data *q; + struct tc_gred_qopt *ctl; + struct tc_gred_sopt *sopt; + struct rtattr *tb[TCA_GRED_STAB]; + struct rtattr *tb2[TCA_GRED_STAB]; + int i; + + if (opt == NULL || + rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ) + return -EINVAL; + + if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0 && + tb[TCA_GRED_DPS-1] != 0) { + rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt), + RTA_PAYLOAD(opt)); + + sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); + table->DPs=sopt->DPs; + table->def=sopt->def_DP; + table->grio=sopt->grio; + table->initd=0; + /* probably need to clear all the table DP entries as well */ + MOD_INC_USE_COUNT; + return 0; + } + + + if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); + if (ctl->DP > MAX_DPs-1 ) { + /* misbehaving is punished! Put in the default drop probability */ + DPRINTK("\nGRED: DP %u not in the proper range fixed. 
New DP " + "set to default at %d\n",ctl->DP,table->def); + ctl->DP=table->def; + } + + if (table->tab[ctl->DP] == NULL) { + table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), + GFP_KERNEL); + if (NULL == table->tab[ctl->DP]) + return -ENOMEM; + memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); + } + q= table->tab[ctl->DP]; + + if (table->grio) { + if (ctl->prio <=0) { + if (table->def && table->tab[table->def]) { + DPRINTK("\nGRED: DP %u does not have a prio" + "setting default to %d\n",ctl->DP, + table->tab[table->def]->prio); + q->prio=table->tab[table->def]->prio; + } else { + DPRINTK("\nGRED: DP %u does not have a prio" + " setting default to 8\n",ctl->DP); + q->prio=8; + } + } else { + q->prio=ctl->prio; + } + } else { + q->prio=8; + } + + + q->DP=ctl->DP; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->limit = ctl->limit; + q->Scell_log = ctl->Scell_log; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->qave=0; + q->backlog=0; + q->qcount = -1; + q->other=0; + q->forced=0; + q->pdrop=0; + q->early=0; + + PSCHED_SET_PASTPERFECT(q->qidlestart); + memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); + + if ( table->initd && table->grio) { + /* this looks ugly but its not in the fast path */ + for (i=0;i<table->DPs;i++) { + if ((!table->tab[i]) || (i==q->DP) ) + continue; + if (table->tab[i]->prio == q->prio ){ + /* WRED mode detected */ + table->eqp=1; + break; + } + } + } + + if (!table->initd) { + table->initd=1; + /* + the first entry also goes into the default until + over-written + */ + + if (table->tab[table->def] == NULL) { + table->tab[table->def]= + kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL); + if (NULL == table->tab[table->def]) + return -ENOMEM; + + memset(table->tab[table->def], 0, + (sizeof(struct gred_sched_data))); + } + q= table->tab[table->def]; + q->DP=table->def; + q->Wlog = 
ctl->Wlog; + q->Plog = ctl->Plog; + q->limit = ctl->limit; + q->Scell_log = ctl->Scell_log; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + + if (table->grio) + q->prio=table->tab[ctl->DP]->prio; + else + q->prio=8; + + q->qcount = -1; + PSCHED_SET_PASTPERFECT(q->qidlestart); + memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256); + } + return 0; + +} + +static int gred_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct gred_sched *table = (struct gred_sched *)sch->data; + struct tc_gred_sopt *sopt; + struct rtattr *tb[TCA_GRED_STAB]; + struct rtattr *tb2[TCA_GRED_STAB]; + + if (opt == NULL || + rtattr_parse(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ) + return -EINVAL; + + if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0 && + tb[TCA_GRED_DPS-1] != 0) { + rtattr_parse(tb2, TCA_GRED_DPS, RTA_DATA(opt),RTA_PAYLOAD(opt)); + + sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); + table->DPs=sopt->DPs; + table->def=sopt->def_DP; + table->grio=sopt->grio; + table->initd=0; + MOD_INC_USE_COUNT; + return 0; + } + + DPRINTK("\n GRED_INIT error!\n"); + return -EINVAL; +} + +static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + unsigned long qave; + struct rtattr *rta; + struct tc_gred_qopt *opt = NULL ; + struct tc_gred_qopt *dst; + struct gred_sched *table = (struct gred_sched *)sch->data; + struct gred_sched_data *q; + int i; + unsigned char *b = skb->tail; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL); + + if (opt == NULL) { + DPRINTK("gred_dump:failed to malloc for %Zd\n", + sizeof(struct tc_gred_qopt)*MAX_DPs); + goto rtattr_failure; + } + + memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs); + + if (!table->initd) { + DPRINTK("NO GRED Queues setup!\n"); + } + + for (i=0;i<MAX_DPs;i++) { + dst= &opt[i]; + q= table->tab[i]; + + if 
(!q) { + /* hack -- fix at some point with proper message + This is how we indicate to tc that there is no VQ + at this DP */ + + dst->DP=MAX_DPs+i; + continue; + } + + dst->limit=q->limit; + dst->qth_min=q->qth_min>>q->Wlog; + dst->qth_max=q->qth_max>>q->Wlog; + dst->DP=q->DP; + dst->backlog=q->backlog; + if (q->qave) { + if (table->eqp && table->grio) { + q->qidlestart=table->tab[table->def]->qidlestart; + q->qave=table->tab[table->def]->qave; + } + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long idle; + psched_time_t now; + PSCHED_GET_TIME(now); + idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); + qave = q->qave >> q->Stab[(idle>>q->Scell_log)&0xFF]; + dst->qave = qave >> q->Wlog; + + } else { + dst->qave = q->qave >> q->Wlog; + } + } else { + dst->qave = 0; + } + + + dst->Wlog = q->Wlog; + dst->Plog = q->Plog; + dst->Scell_log = q->Scell_log; + dst->other = q->other; + dst->forced = q->forced; + dst->early = q->early; + dst->pdrop = q->pdrop; + dst->prio = q->prio; + dst->packets=q->packetsin; + dst->bytesin=q->bytesin; + } + + RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); + rta->rta_len = skb->tail - b; + + kfree(opt); + return skb->len; + +rtattr_failure: + if (opt) + kfree(opt); + DPRINTK("gred_dump: FAILURE!!!!\n"); + +/* also free the opt struct here */ + skb_trim(skb, b - skb->data); + return -1; +} + +static void gred_destroy(struct Qdisc *sch) +{ + struct gred_sched *table = (struct gred_sched *)sch->data; + int i; + + for (i = 0;i < table->DPs; i++) { + if (table->tab[i]) + kfree(table->tab[i]); + } + MOD_DEC_USE_COUNT; +} + +struct Qdisc_ops gred_qdisc_ops = +{ + NULL, + NULL, + "gred", + sizeof(struct gred_sched), + gred_enqueue, + gred_dequeue, + gred_requeue, + gred_drop, + gred_init, + gred_reset, + gred_destroy, + gred_change, /* change */ + gred_dump, +}; + + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&gred_qdisc_ops); +} + +void cleanup_module(void) +{ + 
unregister_qdisc(&gred_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_htb.c b/release/src/linux/linux/net/sched/sch_htb.c new file mode 100644 index 00000000..7539e490 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_htb.c @@ -0,0 +1,1660 @@ +/* vim: ts=8 sw=8 + * net/sched/sch_htb.c Hierarchical token bucket, feed tree version + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Martin Devera, <devik@cdi.cz> + * + * Credits (in time order) for older HTB versions: + * Ondrej Kraus, <krauso@barr.cz> + * found missing INIT_QDISC(htb) + * Vladimir Smelhaus, Aamer Akhter, Bert Hubert + * helped a lot to locate nasty class stall bug + * Andi Kleen, Jamal Hadi, Bert Hubert + * code review and helpful comments on shaping + * Tomasz Wrona, <tw@eter.tym.pl> + * created test case so that I was able to fix nasty bug + * and many others. thanks. + * + * $Id: sch_htb.c,v 1.1.1.4 2003/10/14 08:09:35 sparq Exp $ + */ +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/version.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <linux/list.h> +#include <linux/compiler.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/rbtree.h> + +/* HTB algorithm. 
+ Author: devik@cdi.cz + ======================================================================== + HTB is like TBF with multiple classes. It is also similar to CBQ because + it allows to assign priority to each class in hierarchy. + In fact it is another implementation of Floyd's formal sharing. + + Levels: + Each class is assigned level. Leaf has ALWAYS level 0 and root + classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level + one less than their parent. +*/ + +#define HTB_HSIZE 16 /* classid hash size */ +#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ +#define HTB_DEBUG 1 /* compile debugging support (activated by tc tool) */ +#define HTB_RATECM 1 /* whether to use rate computer */ +#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ +#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) +#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) +#define HTB_VER 0x30007 /* major must be matched with number suplied by TC as version */ + +#if HTB_VER >> 16 != TC_HTB_PROTOVER +#error "Mismatched sch_htb.c and pkt_sch.h" +#endif + +/* temporary debug defines to be removed after beta stage */ +#define DEVIK_MEND(N) +#define DEVIK_MSTART(N) + +/* debugging support; S is subsystem, these are defined: + 0 - netlink messages + 1 - enqueue + 2 - drop & requeue + 3 - dequeue main + 4 - dequeue one prio DRR part + 5 - dequeue class accounting + 6 - class overlimit status computation + 7 - hint tree + 8 - event queue + 10 - rate estimator + 11 - classifier + 12 - fast dequeue cache + + L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full + q->debug uint32 contains 16 2-bit fields one for subsystem starting + from LSB + */ +#ifdef HTB_DEBUG +#define HTB_DBG(S,L,FMT,ARG...) 
if (((q->debug>>(2*S))&3) >= L) \ + printk(KERN_DEBUG FMT,##ARG) +#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) +#define HTB_PASSQ q, +#define HTB_ARGQ struct htb_sched *q, +#define static +#define __inline__ +#define inline +#define HTB_CMAGIC 0xFEFAFEF1 +#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ + if ((N)->rb_color == -1) break; \ + rb_erase(N,R); \ + (N)->rb_color = -1; } while (0) +#else +#define HTB_DBG(S,L,FMT,ARG...) +#define HTB_PASSQ +#define HTB_ARGQ +#define HTB_CHCL(cl) +#define htb_safe_rb_erase(N,R) rb_erase(N,R) +#endif + + +/* used internaly to keep status of single class */ +enum htb_cmode { + HTB_CANT_SEND, /* class can't send and can't borrow */ + HTB_MAY_BORROW, /* class can't send but may borrow */ + HTB_CAN_SEND /* class can send */ +}; + +/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_class +{ +#ifdef HTB_DEBUG + unsigned magic; +#endif + /* general class parameters */ + u32 classid; + struct tc_stats stats; /* generic stats */ + struct tc_htb_xstats xstats;/* our special stats */ + int refcnt; /* usage count of this class */ + +#ifdef HTB_RATECM + /* rate measurement counters */ + unsigned long rate_bytes,sum_bytes; + unsigned long rate_packets,sum_packets; +#endif + + /* topology */ + int level; /* our level (see above) */ + struct htb_class *parent; /* parent class */ + struct list_head hlist; /* classid hash list item */ + struct list_head sibling; /* sibling list item */ + struct list_head children; /* children list */ + + union { + struct htb_class_leaf { + struct Qdisc *q; + int prio; + int aprio; + int quantum; + int deficit[TC_HTB_MAXDEPTH]; + struct list_head drop_list; + } leaf; + struct htb_class_inner { + rb_root_t feed[TC_HTB_NUMPRIO]; /* feed trees */ + rb_node_t *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ + } inner; + } un; + rb_node_t node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + rb_node_t pq_node; /* node for event queue */ + unsigned long 
pq_key; /* the same type as jiffies global */ + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + + /* class attached filters */ + struct tcf_proto *filter_list; + int filter_cnt; + + int warned; /* only one warning about non work conserving .. */ + + /* token bucket parameters */ + struct qdisc_rate_table *rate; /* rate table of the class itself */ + struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ + long buffer,cbuffer; /* token bucket depth/rate */ + long mbuffer; /* max wait time */ + long tokens,ctokens; /* current number of tokens */ + psched_time_t t_c; /* checkpoint time */ +}; + +/* TODO: maybe compute rate when size is too large .. or drop ? */ +static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, + int size) +{ + int slot = size >> rate->rate.cell_log; + if (slot > 255) { + cl->xstats.giants++; + slot = 255; + } + return rate->data[slot]; +} + +struct htb_sched +{ + struct list_head root; /* root classes list */ + struct list_head hash[HTB_HSIZE]; /* hashed by classid */ + struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ + + /* self list - roots of self generating tree */ + rb_root_t row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int row_mask[TC_HTB_MAXDEPTH]; + rb_node_t *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + + /* self wait list - roots of wait PQs per row */ + rb_root_t wait_pq[TC_HTB_MAXDEPTH]; + + /* time of nearest event per level (row) */ + unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; + + /* whether we hit non-work conserving class during this dequeue; we use */ + int nwc_hit; /* this to disable mindelay complaint in dequeue */ + + int defcls; /* class where unclassified flows go to */ + u32 debug; /* subsystem debug levels */ + + /* filters for qdisc itself */ + struct tcf_proto *filter_list; + int filter_cnt; + + int rate2quantum; /* quant = rate / rate2quantum */ + psched_time_t now; /* cached dequeue time */ + 
struct timer_list timer; /* send delay timer */ +#ifdef HTB_RATECM + struct timer_list rttim; /* rate computer timer */ + int recmp_bucket; /* which hash bucket to recompute next */ +#endif + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + int direct_qlen; /* max qlen of above */ + + long direct_pkts; +}; + +/* compute hash of size HTB_HSIZE for given handle */ +static __inline__ int htb_hash(u32 h) +{ +#if HTB_HSIZE != 16 + #error "Declare new hash for your HTB_HSIZE" +#endif + h ^= h>>8; /* stolen from cbq_hash */ + h ^= h>>4; + return h & 0xf; +} + +/* find class in global hash table using given handle */ +static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct list_head *p; + if (TC_H_MAJ(handle) != sch->handle) + return NULL; + + list_for_each (p,q->hash+htb_hash(handle)) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->classid == handle) + return cl; + } + return NULL; +} + +/** + * htb_classify - classify a packet into class + * + * It returns NULL if the packet should be dropped or -1 if the packet + * should be passed directly thru. In all other cases leaf class is returned. + * We allow direct class selection by classid in priority. The we examine + * filters in qdisc and in inner nodes (if higher filter points to the inner + * node). If we end up with classid MAJOR:0 we enqueue the skb into special + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * then finish and return direct queue. 
+ */ +#define HTB_DIRECT (struct htb_class*)-1 +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + /* allow to select class by setting skb->priority to valid classid; + note that nfmark can be used too by attaching filter fw with no + rules in it */ + if (skb->priority == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) selected */ + if ((cl = htb_find(skb->priority,sch)) != NULL) + return cl; + + tcf = q->filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_POLICE + if (result == TC_POLICE_SHOT) + return NULL; +#endif + if ((cl = (void*)res.class) == NULL) { + if (res.classid == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) */ + if ((cl = htb_find(res.classid,sch)) == NULL) + break; /* filter selected invalid classid */ + } + if (!cl->level) + return cl; /* we hit leaf; return it */ + + /* we have got inner class; apply inner filter chain */ + tcf = cl->filter_list; + } + /* classification failed; try to use default class */ + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); + if (!cl || cl->level) + return HTB_DIRECT; /* bad default .. 
this is safe bet */ + return cl; +} + +#ifdef HTB_DEBUG +static void htb_next_rb_node(rb_node_t **n); +#define HTB_DUMTREE(root,memb) if(root) { \ + rb_node_t *n = (root)->rb_node; \ + while (n->rb_left) n = n->rb_left; \ + while (n) { \ + struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ + printk(" %x",cl->classid); htb_next_rb_node (&n); \ + } } + +static void htb_debug_dump (struct htb_sched *q) +{ + int i,p; + printk(KERN_DEBUG "htb*g j=%lu\n",jiffies); + /* rows */ + for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { + printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); + for (p=0;p<TC_HTB_NUMPRIO;p++) { + if (!q->row[i][p].rb_node) continue; + printk(" p%d:",p); + HTB_DUMTREE(q->row[i]+p,node[p]); + } + printk("\n"); + } + /* classes */ + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *l; + list_for_each (l,q->hash+i) { + struct htb_class *cl = list_entry(l,struct htb_class,hlist); + long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); + printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " + "pa=%x f:", + cl->classid,cl->cmode,cl->tokens,cl->ctokens, + cl->pq_node.rb_color==-1?0:cl->pq_key,diff, + cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); + if (cl->level) + for (p=0;p<TC_HTB_NUMPRIO;p++) { + if (!cl->un.inner.feed[p].rb_node) continue; + printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); + HTB_DUMTREE(cl->un.inner.feed+p,node[p]); + } + printk("\n"); + } + } +} +#endif +/** + * htb_add_to_id_tree - adds class to the round robin list + * + * Routine adds class to the list (actually tree) sorted by classid. + * Make sure that class is not already on such list for given prio. 
+ */ +static void htb_add_to_id_tree (HTB_ARGQ rb_root_t *root, + struct htb_class *cl,int prio) +{ + rb_node_t **p = &root->rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); +#ifdef HTB_DEBUG + if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if (*p) { + struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); + HTB_CHCL(x); + } +#endif + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, node[prio]); + HTB_CHCL(c); + if (cl->classid > c->classid) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->node[prio], parent, p); + rb_insert_color(&cl->node[prio], root); +} + +/** + * htb_add_to_wait_tree - adds class to the event queue with delay + * + * The class is added to priority event queue to indicate that class will + * change its mode in cl->pq_key microseconds. Make sure that class is not + * already in the queue. + */ +static void htb_add_to_wait_tree (struct htb_sched *q, + struct htb_class *cl,long delay,int debug_hint) +{ + rb_node_t **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); +#ifdef HTB_DEBUG + if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) + printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); +#endif + DEVIK_MSTART(9); + cl->pq_key = jiffies + PSCHED_US2JIFFIE(delay); + if (cl->pq_key == jiffies) + cl->pq_key++; + + /* update the nearest event cache */ + if (q->near_ev_cache[cl->level] - cl->pq_key < 0x80000000) + q->near_ev_cache[cl->level] = cl->pq_key; + + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, pq_node); + if (cl->pq_key - c->pq_key < 0x80000000) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->pq_node, parent, p); 
+ rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); + DEVIK_MEND(9); +} + +/** + * htb_next_rb_node - finds next node in binary tree + * + * When we are past last key we return NULL. + * Average complexity is 2 steps per call. + */ +static void htb_next_rb_node(rb_node_t **n) +{ + rb_node_t *p; + if ((*n)->rb_right) { + *n = (*n)->rb_right; + while ((*n)->rb_left) + *n = (*n)->rb_left; + return; + } + while ((p = (*n)->rb_parent) != NULL) { + if (p->rb_left == *n) break; + *n = p; + } + *n = p; +} + +/** + * htb_add_class_to_row - add class to its row + * + * The class is added to row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", + cl->classid,mask,q->row_mask[cl->level]); + HTB_CHCL(cl); + q->row_mask[cl->level] |= mask; + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); + } +} + +/** + * htb_remove_class_from_row - removes class from its row + * + * The class is removed from row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static __inline__ void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + int m = 0; + HTB_CHCL(cl); + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + if (q->ptr[cl->level][prio] == cl->node+prio) + htb_next_rb_node(q->ptr[cl->level]+prio); + htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); + if (!q->row[cl->level][prio].rb_node) + m |= 1 << prio; + } + HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", + cl->classid,mask,q->row_mask[cl->level],m); + q->row_mask[cl->level] &= ~m; +} + +/** + * htb_activate_prios - creates active classe's feed chain + * + * The class is connected to ancestors and/or appropriate rows + * for priorities it is participating on. 
cl->cmode must be new
 * (activated) mode. It does nothing if cl->prio_activity == 0.
 */
static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl)
{
	struct htb_class *parent = cl->parent;
	long pending, mask = cl->prio_activity;

	HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode);
	HTB_CHCL(cl);

	/* Walk up while the current class has to borrow from its parent. */
	for (; cl->cmode == HTB_MAY_BORROW && parent && mask;
	     cl = parent, parent = cl->parent) {
		HTB_CHCL(parent);
		for (pending = mask; pending; ) {
			int prio = ffz(~pending);

			pending &= ~(1 << prio);

			/* A parent whose feed for this prio is already in use
			   is already active there, so the prio need not be
			   propagated any further up. */
			if (parent->un.inner.feed[prio].rb_node)
				mask &= ~(1 << prio);

			htb_add_to_id_tree(HTB_PASSQ parent->un.inner.feed+prio,cl,prio);
		}
		HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n",
				parent->classid,parent->prio_activity,mask,parent->cmode);
		parent->prio_activity |= mask;
		HTB_CHCL(parent);
	}

	/* The class where the walk stopped goes into its row if it may send. */
	if (cl->cmode == HTB_CAN_SEND && mask)
		htb_add_class_to_row(q,cl,mask);
}

/**
 * htb_deactivate_prios - remove class from feed chain
 *
 * cl->cmode must represent old mode (before deactivation). It does
 * nothing if cl->prio_activity == 0. Class is removed from all feed
 * chains and rows.
+ */ +static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m,mask = cl->prio_activity; + HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); + HTB_CHCL(cl); + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + m = mask; mask = 0; + while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.ptr[prio] == cl->node+prio) + htb_next_rb_node(p->un.inner.ptr + prio); + + htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); + + if (!p->un.inner.feed[prio].rb_node) + mask |= 1 << prio; + } + HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", + p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity &= ~mask; + cl = p; p = cl->parent; + HTB_CHCL(cl); + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q,cl,mask); +} + +/** + * htb_class_mode - computes and returns current class mode + * + * It computes cl's mode at time cl->t_c+diff and returns it. If mode + * is not HTB_CAN_SEND then cl->pq_key is updated to time difference + * from now to time when cl will change its state. + * Also it is worth to note that class mode doesn't change simply + * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * 0 .. -cl->{c,}buffer range. It is meant to limit number of + * mode transitions per time unit. The speed gain is about 1/6. + */ +static __inline__ enum htb_cmode +htb_class_mode(struct htb_class *cl,long *diff) +{ + long toks; + + if ((toks = (cl->ctokens + *diff)) < ( +#ifdef HTB_HYSTERESIS + cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : +#endif + 0)) { + *diff = -toks; + return HTB_CANT_SEND; + } + if ((toks = (cl->tokens + *diff)) >= ( +#ifdef HTB_HYSTERESIS + cl->cmode == HTB_CAN_SEND ? 
-cl->buffer : +#endif + 0)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; +} + +/** + * htb_change_class_mode - changes classe's mode + * + * This should be the only way how to change classe's mode under normal + * cirsumstances. Routine will update feed lists linkage, change mode + * and add class to the wait event queue if appropriate. New mode should + * be different from old one and cl->pq_key has to be valid if changing + * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). + */ +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl,diff); + + HTB_CHCL(cl); + HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); + + if (new_mode == cl->cmode) + return; + + if (cl->prio_activity) { /* not neccessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q,cl); + cl->cmode = new_mode; + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q,cl); + } else + cl->cmode = new_mode; +} + +/** + * htb_activate - inserts leaf cl into appropriate active feeds + * + * Routine learns (new) priority of leaf and activates feed chain + * for the prio. It can be called on already active leaf safely. + * It also adds leaf into droplist. + */ +static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); + HTB_CHCL(cl); + if (!cl->prio_activity) { + cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); + htb_activate_prios(q,cl); + list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); + } +} + +/** + * htb_deactivate - remove leaf cl from active feeds + * + * Make sure that leaf is active. In the other words it can't be called + * with non-active leaf. It also removes class from the drop list. 
+ */ +static __inline__ void +htb_deactivate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(cl->prio_activity); + HTB_CHCL(cl); + htb_deactivate_prios(q,cl); + cl->prio_activity = 0; + list_del_init(&cl->un.leaf.drop_list); +} + +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = htb_classify(skb,sch); + + DEVIK_MSTART(0); + if (cl == HTB_DIRECT || !cl) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen && cl) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } else { + kfree_skb (skb); + sch->stats.drops++; + DEVIK_MEND(0); + return NET_XMIT_DROP; + } + } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->stats.drops++; + cl->stats.drops++; + DEVIK_MEND(0); + return NET_XMIT_DROP; + } else { + cl->stats.packets++; cl->stats.bytes += skb->len; + DEVIK_MSTART(1); + htb_activate (q,cl); + DEVIK_MEND(1); + } + + sch->q.qlen++; + sch->stats.packets++; sch->stats.bytes += skb->len; + HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",cl?cl->classid:0,skb); + DEVIK_MEND(0); + return NET_XMIT_SUCCESS; +} + +/* TODO: requeuing packet charges it to policers again !! 
*/ +static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = htb_classify(skb,sch); + + if (cl == HTB_DIRECT || !cl) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen && cl) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } else { + kfree_skb (skb); + sch->stats.drops++; + return NET_XMIT_DROP; + } + } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->stats.drops++; + cl->stats.drops++; + return NET_XMIT_DROP; + } else + htb_activate (q,cl); + + sch->q.qlen++; + HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",cl?cl->classid:0,skb); + return NET_XMIT_SUCCESS; +} + +static void htb_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + sch->flags &= ~TCQ_F_THROTTLED; + wmb(); + netif_schedule(sch->dev); +} + +#ifdef HTB_RATECM +#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 +static void htb_rate_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct htb_sched *q = (struct htb_sched *)sch->data; + struct list_head *p; + + /* lock queue so that we can muck with it */ + HTB_QLOCK(sch); + HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); + + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); + + /* scan and recompute one bucket at time */ + if (++q->recmp_bucket >= HTB_HSIZE) + q->recmp_bucket = 0; + list_for_each (p,q->hash+q->recmp_bucket) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", + cl->classid,cl->sum_bytes,cl->sum_packets); + RT_GEN (cl->sum_bytes,cl->rate_bytes); + RT_GEN (cl->sum_packets,cl->rate_packets); + } + HTB_QUNLOCK(sch); +} +#endif + +/** + * htb_charge_class - charges ammount "bytes" to leaf and ancestors + * + * Routine assumes that packet "bytes" long was dequeued from leaf cl + * borrowing from "level". 
It accounts bytes to ceil leaky bucket for + * leaf and all ancestors and to rate bucket for ancestors at levels + * "level" and higher. It also handles possible change of mode resulting + * from the update. Note that mode can also increase here (MAY_BORROW to + * CAN_SEND) because we can use more precise clock that event queue here. + * In such case we remove class from event queue first. + */ +static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, + int level,int bytes) +{ + long toks,diff; + enum htb_cmode old_mode; + HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); + +#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ + if (toks > cl->B) toks = cl->B; \ + toks -= L2T(cl, cl->R, bytes); \ + if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ + cl->T = toks + + while (cl) { + HTB_CHCL(cl); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, + (unsigned long long) q->now, + (unsigned long long) cl->t_c, + jiffies); + diff = 1000; + } +#endif + if (cl->level >= level) { + if (cl->level == level) cl->xstats.lends++; + HTB_ACCNT (tokens,buffer,rate); + } else { + cl->xstats.borrows++; + cl->tokens += diff; /* we moved t_c; update tokens */ + } + HTB_ACCNT (ctokens,cbuffer,ceil); + cl->t_c = q->now; + HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); + + old_mode = cl->cmode; diff = 0; + htb_change_class_mode(q,cl,&diff); + if (old_mode != cl->cmode) { + if (old_mode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,1); + } + +#ifdef HTB_RATECM + /* update rate counters */ + cl->sum_bytes += bytes; cl->sum_packets++; +#endif + + /* update byte stats except for 
leaves which are already updated */ + if (cl->level) { + cl->stats.bytes += bytes; + cl->stats.packets++; + } + cl = cl->parent; + } +} + +/** + * htb_do_events - make mode changes to classes at the level + * + * Scans event queue for pending events and applies them. Returns jiffies to + * next pending event (0 for no event in pq). + */ +static long htb_do_events(struct htb_sched *q,int level) +{ + int i; + HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", + level,q->wait_pq[level].rb_node,q->row_mask[level]); + for (i = 0; i < 500; i++) { + struct htb_class *cl; + long diff; + rb_node_t *p = q->wait_pq[level].rb_node; + if (!p) return 0; + while (p->rb_left) p = p->rb_left; + + cl = rb_entry(p, struct htb_class, pq_node); + if (cl->pq_key - (jiffies+1) < 0x80000000) { + HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - jiffies); + return cl->pq_key - jiffies; + } + htb_safe_rb_erase(p,q->wait_pq+level); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer, 0); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, + (unsigned long long) q->now, + (unsigned long long) cl->t_c, + jiffies); + diff = 1000; + } +#endif + htb_change_class_mode(q,cl,&diff); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,2); + } + if (net_ratelimit()) + printk(KERN_WARNING "htb: too many events !\n"); + return HZ/10; +} + +/** + * htb_lookup_leaf - returns next leaf class in DRR order + * + * Find leaf where current feed pointers points to. 
+ */ +static struct htb_class * +htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr) +{ + int i; + struct { + rb_node_t *root; + rb_node_t **pptr; + } stk[TC_HTB_MAXDEPTH],*sp = stk; + + sp->root = tree->rb_node; + sp->pptr = pptr; + + for (i = 0; i < 65535; i++) { + if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pptr = sp->root; + while ((*sp->pptr)->rb_left) + *sp->pptr = (*sp->pptr)->rb_left; + if (sp > stk) { + sp--; + BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; + htb_next_rb_node (sp->pptr); + } + } else { + struct htb_class *cl; + cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); + HTB_CHCL(cl); + if (!cl->level) + return cl; + (++sp)->root = cl->un.inner.feed[prio].rb_node; + sp->pptr = cl->un.inner.ptr+prio; + } + } + BUG_TRAP(0); + return NULL; +} + +/* dequeues packet at given priority and level; call only if + you are sure that there is active class at prio/level */ +static struct sk_buff * +htb_dequeue_tree(struct htb_sched *q,int prio,int level) +{ + struct sk_buff *skb = NULL; + //struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl,*start; + /* look initial class up in the row */ + DEVIK_MSTART(6); + start = cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); + + do { + BUG_TRAP(cl && cl->un.leaf.q->q.qlen); if (!cl) return NULL; + HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", + prio,level,cl->classid,cl->un.leaf.deficit[level]); + + if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) + break; + if (!cl->warned) { + printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); + cl->warned = 1; + } + q->nwc_hit++; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); + } while (cl != start); + + DEVIK_MEND(6); + DEVIK_MSTART(7); + if (likely(skb != NULL)) { + if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { + HTB_DBG(4,2,"htb_next_cl oldptr=%p 
quant_add=%d\n", + level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); + cl->un.leaf.deficit[level] += cl->un.leaf.quantum; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + } + /* this used to be after charge_class but this constelation + gives us slightly better performance */ + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + DEVIK_MSTART(8); + htb_charge_class (q,cl,level,skb->len); + DEVIK_MEND(8); + } + DEVIK_MEND(7); + return skb; +} + +static void htb_delay_by(struct Qdisc *sch,long delay) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + if (netif_queue_stopped(sch->dev)) return; + if (delay <= 0) delay = 1; + if (unlikely(delay > 5*HZ)) { + if (net_ratelimit()) + printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); + delay = 5*HZ; + } + del_timer(&q->timer); + q->timer.expires = jiffies + delay; + add_timer(&q->timer); + sch->flags |= TCQ_F_THROTTLED; + sch->stats.overlimits++; + HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); +} + +static struct sk_buff *htb_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb = NULL; + struct htb_sched *q = (struct htb_sched *)sch->data; + int level; + long min_delay; + + HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), + sch->q.qlen); + + /* try to dequeue direct packets as high prio (!) to minimize cpu work */ + if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + return skb; + } + + DEVIK_MSTART(2); + if (!sch->q.qlen) goto fin; + PSCHED_GET_TIME(q->now); + + min_delay = HZ*5; + q->nwc_hit = 0; + for (level = 0; level < TC_HTB_MAXDEPTH; level++) { + /* common case optimization - skip event handler quickly */ + int m; + long delay; + DEVIK_MSTART(3); + if (jiffies - q->near_ev_cache[level] < 0x80000000 || 0) { + delay = htb_do_events(q,level); + q->near_ev_cache[level] += delay ? 
delay : HZ; + } else + delay = q->near_ev_cache[level] - jiffies; + + if (delay && min_delay > delay) + min_delay = delay; + DEVIK_MEND(3); + DEVIK_MSTART(5); + m = ~q->row_mask[level]; + while (m != (int)(-1)) { + int prio = ffz (m); + m |= 1 << prio; + skb = htb_dequeue_tree(q,prio,level); + if (likely(skb != NULL)) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + DEVIK_MEND(5); + goto fin; + } + } + DEVIK_MEND(5); + } + DEVIK_MSTART(4); +#ifdef HTB_DEBUG + if (!q->nwc_hit && min_delay >= 5*HZ && net_ratelimit()) { + printk(KERN_ERR "HTB: mindelay=%ld, report it please !\n",min_delay); + htb_debug_dump(q); + } +#endif + htb_delay_by (sch,min_delay); + DEVIK_MEND(4); +fin: + HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,jiffies,skb); + DEVIK_MEND(2); + return skb; +} + +/* try to drop from each class (by prio) until one succeed */ +static int htb_drop(struct Qdisc* sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + int prio; + + for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { + struct list_head *p; + list_for_each (p,q->drops+prio) { + struct htb_class *cl = list_entry(p,struct htb_class, + un.leaf.drop_list); + if (cl->un.leaf.q->ops->drop && + cl->un.leaf.q->ops->drop(cl->un.leaf.q)) { + sch->q.qlen--; + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + return 1; + } + } + } + return 0; +} + +/* reset all classes */ +/* always caled under BH & queue lock */ +static void htb_reset(struct Qdisc* sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + int i; + HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->level) + memset(&cl->un.inner,0,sizeof(cl->un.inner)); + else { + if (cl->un.leaf.q) + qdisc_reset(cl->un.leaf.q); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); + } + cl->prio_activity = 0; + cl->cmode = HTB_CAN_SEND; +#ifdef HTB_DEBUG + 
cl->pq_node.rb_color = -1; + memset(cl->node,255,sizeof(cl->node)); +#endif + + } + } + sch->flags &= ~TCQ_F_THROTTLED; + del_timer(&q->timer); + __skb_queue_purge(&q->direct_queue); + sch->q.qlen = 0; + memset(q->row,0,sizeof(q->row)); + memset(q->row_mask,0,sizeof(q->row_mask)); + memset(q->wait_pq,0,sizeof(q->wait_pq)); + memset(q->ptr,0,sizeof(q->ptr)); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); +} + +static int htb_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct htb_sched *q = (struct htb_sched*)sch->data; + struct rtattr *tb[TCA_HTB_INIT]; + struct tc_htb_glob *gopt; + int i; +#ifdef HTB_DEBUG + printk(KERN_INFO "HTB init, kernel part version %d.%d\n", + HTB_VER >> 16,HTB_VER & 0xffff); +#endif + if (!opt || rtattr_parse(tb, TCA_HTB_INIT, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_HTB_INIT-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) { + printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + return -EINVAL; + } + gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); + if (gopt->version != HTB_VER >> 16) { + printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", + HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); + return -EINVAL; + } + memset(q,0,sizeof(*q)); + q->debug = gopt->debug; + HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); + + INIT_LIST_HEAD(&q->root); + for (i = 0; i < HTB_HSIZE; i++) + INIT_LIST_HEAD(q->hash+i); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); + + init_timer(&q->timer); + skb_queue_head_init(&q->direct_queue); + + q->direct_qlen = sch->dev->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + q->timer.function = htb_timer; + q->timer.data = (unsigned long)sch; + +#ifdef HTB_RATECM + init_timer(&q->rttim); + q->rttim.function = htb_rate_timer; + q->rttim.data = (unsigned long)sch; + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); +#endif + if 
((q->rate2quantum = gopt->rate2quantum) < 1) + q->rate2quantum = 1; + q->defcls = gopt->defcls; + + MOD_INC_USE_COUNT; + return 0; +} + +static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct htb_sched *q = (struct htb_sched*)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_htb_glob gopt; + HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); + /* stats */ + HTB_QLOCK(sch); + gopt.direct_pkts = q->direct_pkts; + +#ifdef HTB_DEBUG + htb_debug_dump(q); +#endif + gopt.version = HTB_VER; + gopt.rate2quantum = q->rate2quantum; + gopt.defcls = q->defcls; + gopt.debug = q->debug; + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + rta->rta_len = skb->tail - b; + sch->stats.qlen = sch->q.qlen; + RTA_PUT(skb, TCA_STATS, sizeof(sch->stats), &sch->stats); + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, skb->tail - skb->data); + return -1; +} + +static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = (struct htb_sched*)sch->data; +#endif + struct htb_class *cl = (struct htb_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_htb_opt opt; + + HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); + + HTB_QLOCK(sch); + tcm->tcm_parent = cl->parent ? 
cl->parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (!cl->level && cl->un.leaf.q) { + tcm->tcm_info = cl->un.leaf.q->handle; + cl->stats.qlen = cl->un.leaf.q->q.qlen; + } + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + memset (&opt,0,sizeof(opt)); + + opt.rate = cl->rate->rate; opt.buffer = cl->buffer; + opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; + opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; + opt.level = cl->level; + RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + +#ifdef HTB_RATECM + cl->stats.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); + cl->stats.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); +#endif + + cl->xstats.tokens = cl->tokens; + cl->xstats.ctokens = cl->ctokens; + RTA_PUT(skb, TCA_STATS, sizeof(cl->stats), &cl->stats); + RTA_PUT(skb, TCA_XSTATS, sizeof(cl->xstats), &cl->xstats); + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, b - skb->data); + return -1; +} + +static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct htb_class *cl = (struct htb_class*)arg; + + if (cl && !cl->level) { + if (new == NULL && (new = qdisc_create_dflt(sch->dev, + &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + sch_tree_lock(sch); + if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { + /* TODO: is it correct ? Why CBQ doesn't do it ? */ + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + } + sch_tree_unlock(sch); + return 0; + } + return -ENOENT; +} + +static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class*)arg; + return (cl && !cl->level) ? 
cl->un.leaf.q : NULL; +} + +static unsigned long htb_get(struct Qdisc *sch, u32 classid) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = (struct htb_sched *)sch->data; +#endif + struct htb_class *cl = htb_find(classid,sch); + HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); + if (cl) + cl->refcnt++; + return (unsigned long)cl; +} + +static void htb_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tp->ops->destroy(tp); + } +} + +static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); + if (!cl->level) { + BUG_TRAP(cl->un.leaf.q); + sch->q.qlen -= cl->un.leaf.q->q.qlen; + qdisc_destroy(cl->un.leaf.q); + } + qdisc_put_rtab(cl->rate); + qdisc_put_rtab(cl->ceil); + +#ifdef CONFIG_NET_ESTIMATOR + qdisc_kill_estimator(&cl->stats); +#endif + htb_destroy_filters (&cl->filter_list); + + while (!list_empty(&cl->children)) + htb_destroy_class (sch,list_entry(cl->children.next, + struct htb_class,sibling)); + + /* note: this delete may happen twice (see htb_delete) */ + list_del(&cl->hlist); + list_del(&cl->sibling); + + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + + kfree(cl); +} + +/* always caled under BH & queue lock */ +static void htb_destroy(struct Qdisc* sch) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + HTB_DBG(0,1,"htb_destroy q=%p\n",q); + + del_timer_sync (&q->timer); +#ifdef HTB_RATECM + del_timer_sync (&q->rttim); +#endif + while (!list_empty(&q->root)) + htb_destroy_class (sch,list_entry(q->root.next, + struct htb_class,sibling)); + + htb_destroy_filters(&q->filter_list); + __skb_queue_purge(&q->direct_queue); + MOD_DEC_USE_COUNT; +} + +static int htb_delete(struct Qdisc *sch, unsigned long arg) +{ + 
struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + // TODO: why don't allow to delete subtree ? references ? does + // tc subsys quarantee us that in htb_destroy it holds no class + // refs so that we can remove children safely there ? + if (!list_empty(&cl->children) || cl->filter_cnt) + return -EBUSY; + + sch_tree_lock(sch); + + /* delete from hash and active; remainder in destroy_class */ + list_del_init(&cl->hlist); + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); + + sch_tree_unlock(sch); + return 0; +} + +static void htb_put(struct Qdisc *sch, unsigned long arg) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = (struct htb_sched *)sch->data; +#endif + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); +} + +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct rtattr **tca, unsigned long *arg) +{ + int err = -EINVAL; + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = (struct htb_class*)*arg,*parent; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; + struct rtattr *tb[TCA_HTB_RTAB]; + struct tc_htb_opt *hopt; + + /* extract all subattrs from opt attr */ + if (!opt || rtattr_parse(tb, TCA_HTB_RTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_HTB_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) + goto failure; + + parent = parentid == TC_H_ROOT ? 
NULL : htb_find (parentid,sch); + + hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); + HTB_DBG(0,1,"htb_chg cl=%p, clid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); + if (!rtab || !ctab) goto failure; + + if (!cl) { /* new class */ + /* check for valid classid */ + if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) + goto failure; + + /* check maximal depth */ + if (parent && parent->parent && parent->parent->level < 2) { + printk(KERN_ERR "htb: tree is too deep\n"); + goto failure; + } + err = -ENOBUFS; + if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + goto failure; + + memset(cl, 0, sizeof(*cl)); + cl->refcnt = 1; + INIT_LIST_HEAD(&cl->sibling); + INIT_LIST_HEAD(&cl->hlist); + INIT_LIST_HEAD(&cl->children); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); +#ifdef HTB_DEBUG + cl->magic = HTB_CMAGIC; +#endif + + sch_tree_lock(sch); + if (parent && !parent->level) { + /* turn parent into inner node */ + sch->q.qlen -= parent->un.leaf.q->q.qlen; + qdisc_destroy (parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate (q,parent); + + /* remove from evt list because of level change */ + if (parent->cmode != HTB_CAN_SEND) { + htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); + parent->cmode = HTB_CAN_SEND; + } + parent->level = (parent->parent ? 
parent->parent->level + : TC_HTB_MAXDEPTH) - 1; + memset (&parent->un.inner,0,sizeof(parent->un.inner)); + } + /* leaf (we) needs elementary qdisc */ + if (!(cl->un.leaf.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->un.leaf.q = &noop_qdisc; + + cl->classid = classid; cl->parent = parent; + + /* set class to be in HTB_CAN_SEND state */ + cl->tokens = hopt->buffer; + cl->ctokens = hopt->cbuffer; + cl->mbuffer = 60000000; /* 1min */ + PSCHED_GET_TIME(cl->t_c); + cl->cmode = HTB_CAN_SEND; + + /* attach to the hash list and parent's family */ + list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); + list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); +#ifdef HTB_DEBUG + { + int i; + for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; + cl->pq_node.rb_color = -1; + } +#endif + } else sch_tree_lock(sch); + + /* it used to be a nasty bug here, we have to check that node + is really leaf before changing cl->un.leaf ! */ + if (!cl->level) { + cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; + if (!hopt->quantum && cl->un.leaf.quantum < 1000) { + printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.", cl->classid); + cl->un.leaf.quantum = 1000; + } + if (!hopt->quantum && cl->un.leaf.quantum > 200000) { + printk(KERN_WARNING "HTB: quantum of class %X is big. 
Consider r2q change.", cl->classid); + cl->un.leaf.quantum = 200000; + } + if (hopt->quantum) + cl->un.leaf.quantum = hopt->quantum; + if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; + } + + cl->buffer = hopt->buffer; + cl->cbuffer = hopt->cbuffer; + if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; + if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; + sch_tree_unlock(sch); + + *arg = (unsigned long)cl; + return 0; + +failure: + if (rtab) qdisc_put_rtab(rtab); + if (ctab) qdisc_put_rtab(ctab); + return err; +} + +static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = (struct htb_class *)arg; + struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); + return fl; +} + +static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = htb_find (classid,sch); + HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); + /*if (cl && !cl->level) return 0; + The line above used to be there to prevent attaching filters to + leaves. But at least tc_index filter uses this just to get class + for other reasons so that we have to allow for it. + ---- + 19.6.2002 As Werner explained it is ok - bind filter is just + another way to "lock" the class - unlike "get" this lock can + be broken by class during destroy IIUC. 
+ */ + if (cl) + cl->filter_cnt++; + else + q->filter_cnt++; + return (unsigned long)cl; +} + +static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + struct htb_class *cl = (struct htb_class *)arg; + HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); + if (cl) + cl->filter_cnt--; + else + q->filter_cnt--; +} + +static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct htb_sched *q = (struct htb_sched *)sch->data; + int i; + + if (arg->stop) + return; + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops htb_class_ops = +{ + htb_graft, + htb_leaf, + htb_get, + htb_put, + htb_change_class, + htb_delete, + htb_walk, + + htb_find_tcf, + htb_bind_filter, + htb_unbind_filter, + + htb_dump_class, +}; + +struct Qdisc_ops htb_qdisc_ops = +{ + NULL, + &htb_class_ops, + "htb", + sizeof(struct htb_sched), + + htb_enqueue, + htb_dequeue, + htb_requeue, + htb_drop, + + htb_init, + htb_reset, + htb_destroy, + NULL /* htb_change */, + + htb_dump, +}; + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&htb_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&htb_qdisc_ops); +} +MODULE_LICENSE("GPL"); +#endif diff --git a/release/src/linux/linux/net/sched/sch_ingress.c b/release/src/linux/linux/net/sched/sch_ingress.c new file mode 100644 index 00000000..2b30fce0 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_ingress.c @@ -0,0 +1,372 @@ +/* net/sched/sch_ingress.c - Ingress qdisc + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as 
published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim 1999 + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter.h> +#include <net/pkt_sched.h> +#include <asm/byteorder.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <linux/kmod.h> +#include <linux/stat.h> +#include <linux/interrupt.h> +#include <linux/list.h> + + +#undef DEBUG_INGRESS + +#ifdef DEBUG_INGRESS /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#define D2PRINTK(format,args...) + + +#define PRIV(sch) ((struct ingress_qdisc_data *) (sch)->data) + + +/* Thanks to Doron Oz for this hack +*/ +static int nf_registered = 0; + +struct ingress_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int ingress_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + + DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", + sch, p, new, old); + DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); + return 1; +} + + +static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + + +static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + return TC_H_MIN(classid) + 1; +} + + +static unsigned long ingress_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return ingress_get(sch, classid); +} + + +static void 
ingress_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+
+/* class "change" request: accepted but does nothing */
+static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent,
+			  struct rtattr **tca, unsigned long *arg)
+{
+#ifdef DEBUG_INGRESS
+	struct ingress_qdisc_data *p = PRIV(sch);
+#endif
+	DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x),"
+		"arg 0x%lx\n", sch, p, classid, parent, *arg);
+	DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
+	return 0;
+}
+
+
+
+/* class walk: there are no classes, so nothing is visited */
+static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker)
+{
+#ifdef DEBUG_INGRESS
+	struct ingress_qdisc_data *p = PRIV(sch);
+#endif
+	DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+	DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment");
+}
+
+
+/* every class argument maps to the qdisc's single filter chain */
+static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl)
+{
+	struct ingress_qdisc_data *p = PRIV(sch);
+
+	return &p->filter_list;
+}
+
+
+/* --------------------------- Qdisc operations ---------------------------- */
+
+
+/**
+ * ingress_enqueue - classify an incoming packet and return a verdict
+ *
+ * Runs the filter chain on skb, updates the qdisc statistics and stores
+ * the minor part of the matched class id in skb->tc_index.  Nothing is
+ * actually queued.
+ *
+ * Returns NF_DROP when policing is compiled in and the classifier
+ * answered TC_POLICE_SHOT, NF_ACCEPT in every other case.
+ */
+static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch)
+{
+	struct ingress_qdisc_data *p = PRIV(sch);
+	struct tcf_result res;
+	int result;
+
+	D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+	result = tc_classify(skb, p->filter_list, &res);
+	D2PRINTK("result %d class 0x%04x\n", result, res.classid);
+	/*
+	 * Unlike normal "enqueue" functions, ingress_enqueue returns a
+	 * netfilter NF_* verdict (ing_hook passes it straight back).
+	 */
+#ifdef CONFIG_NET_CLS_POLICE
+	switch (result) {
+		case TC_POLICE_SHOT:
+			result = NF_DROP;
+			sch->stats.drops++;
+			break;
+		case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? */
+		case TC_POLICE_OK:
+		case TC_POLICE_UNSPEC:
+		default:
+			sch->stats.packets++;
+			sch->stats.bytes += skb->len;
+			result = NF_ACCEPT;
+			break;
+	};
+#else
+	/* no policing: the raw classifier result is not a verdict */
+	result = NF_ACCEPT;
+	sch->stats.packets++;
+	sch->stats.bytes += skb->len;
+#endif
+
+	skb->tc_index = TC_H_MIN(res.classid);
+	return result;
+}
+
+
+/* nothing is ever queued, so there is nothing to dequeue */
+static struct sk_buff *ingress_dequeue(struct Qdisc *sch)
+{
+/*
+	struct ingress_qdisc_data *p = PRIV(sch);
+	D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p));
+*/
+	return NULL;
+}
+
+
+/* requeue is a no-op that reports success (0) */
+static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch)
+{
+/*
+	struct ingress_qdisc_data *p = PRIV(sch);
+	D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p));
+*/
+	return 0;
+}
+
+/* nothing is queued, so nothing can be dropped: always returns 0 */
+static int ingress_drop(struct Qdisc *sch)
+{
+#ifdef DEBUG_INGRESS
+	struct ingress_qdisc_data *p = PRIV(sch);
+#endif
+	DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p);
+	return 0;
+}
+
+static unsigned int
+ing_hook(unsigned int hook, struct sk_buff **pskb,
+	 const struct net_device *indev,
+	 const struct net_device *outdev,
+	 int (*okfn)(struct sk_buff *))
+{
+
+	struct Qdisc *q;
+	struct sk_buff *skb = *pskb;
+	struct net_device *dev = skb->dev;
+	int fwres=NF_ACCEPT;
+
+	DPRINTK("ing_hook: skb %s dev=%s len=%u\n",
+		skb->sk ? "(owned)" : "(unowned)",
+		skb->dev ? 
(*pskb)->dev->name : "(no dev)", + skb->len); + +/* +revisit later: Use a private since lock dev->queue_lock is also +used on the egress (might slow things for an iota) +*/ + + if (dev->qdisc_ingress) { + spin_lock(&dev->queue_lock); + if ((q = dev->qdisc_ingress) != NULL) + fwres = q->enqueue(skb, q); + spin_unlock(&dev->queue_lock); + } + + return fwres; +} + +/* after ipt_filter */ +static struct nf_hook_ops ing_ops = +{ + { NULL, NULL}, + ing_hook, + PF_INET, + NF_IP_PRE_ROUTING, + NF_IP_PRI_FILTER + 1 +}; + +int ingress_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + if (!nf_registered) { + if (nf_register_hook(&ing_ops) < 0) { + printk("ingress qdisc registration error \n"); + goto error; + } + nf_registered++; + } + + DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + memset(p, 0, sizeof(*p)); + p->filter_list = NULL; + p->q = &noop_qdisc; + MOD_INC_USE_COUNT; + return 0; +error: + return -EINVAL; +} + + +static void ingress_reset(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); + +/* +*/ +} + +/* ------------------------------------------------------------- */ + + +/* ------------------------------------------------------------- */ + +static void ingress_destroy(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + struct tcf_proto *tp; + + DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tp->ops->destroy(tp); + } + memset(p, 0, sizeof(*p)); + p->filter_list = NULL; + + + MOD_DEC_USE_COUNT; + +} + + +static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct 
Qdisc_class_ops ingress_class_ops = +{ + ingress_graft, /* graft */ + ingress_leaf, /* leaf */ + ingress_get, /* get */ + ingress_put, /* put */ + ingress_change, /* change */ + NULL, /* delete */ + ingress_walk, /* walk */ + + ingress_find_tcf, /* tcf_chain */ + ingress_bind_filter, /* bind_tcf */ + ingress_put, /* unbind_tcf */ + + NULL, /* dump */ +}; + +struct Qdisc_ops ingress_qdisc_ops = +{ + NULL, /* next */ + &ingress_class_ops, /* cl_ops */ + "ingress", + sizeof(struct ingress_qdisc_data), + + ingress_enqueue, /* enqueue */ + ingress_dequeue, /* dequeue */ + ingress_requeue, /* requeue */ + ingress_drop, /* drop */ + + ingress_init, /* init */ + ingress_reset, /* reset */ + ingress_destroy, /* destroy */ + NULL, /* change */ + + ingress_dump, /* dump */ +}; + + +#ifdef MODULE +int init_module(void) +{ + int ret = 0; + + if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { + printk("Unable to register Ingress qdisc\n"); + return ret; + } + + return ret; +} + + +void cleanup_module(void) +{ + unregister_qdisc(&ingress_qdisc_ops); + if (nf_registered) + nf_unregister_hook(&ing_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_prio.c b/release/src/linux/linux/net/sched/sch_prio.c new file mode 100644 index 00000000..62a37363 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_prio.c @@ -0,0 +1,417 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: + * Init -- EINVAL when opt undefined + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +struct prio_sched_data +{ + int bands; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; +}; + + +static __inline__ unsigned prio_classify(struct sk_buff *skb, struct Qdisc *sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct tcf_result res; + u32 band; + + band = skb->priority; + if (TC_H_MAJ(skb->priority) != sch->handle) { + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { + if (TC_H_MAJ(band)) + band = 0; + return q->prio2band[band&TC_PRIO_MAX]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + return band < q->bands ? 
band : q->prio2band[0]; +} + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct Qdisc *qdisc; + int ret; + + qdisc = q->queues[prio_classify(skb, sch)]; + + if ((ret = qdisc->enqueue(skb, qdisc)) == 0) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + sch->q.qlen++; + return 0; + } + sch->stats.drops++; + return ret; +} + + +static int +prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct Qdisc *qdisc; + int ret; + + qdisc = q->queues[prio_classify(skb, sch)]; + + if ((ret = qdisc->ops->requeue(skb, qdisc)) == 0) { + sch->q.qlen++; + return 0; + } + sch->stats.drops++; + return ret; +} + + +static struct sk_buff * +prio_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = 0; prio < q->bands; prio++) { + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static int +prio_drop(struct Qdisc* sch) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if (qdisc->ops->drop(qdisc)) { + sch->q.qlen--; + return 1; + } + } + return 0; +} + + +static void +prio_reset(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; prio<q->bands; prio++) + qdisc_reset(q->queues[prio]); + sch->q.qlen = 0; +} + +static void +prio_destroy(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + for (prio=0; prio<q->bands; prio++) { + qdisc_destroy(q->queues[prio]); + q->queues[prio] = &noop_qdisc; + } + MOD_DEC_USE_COUNT; +} + +static int prio_tune(struct Qdisc *sch, struct rtattr 
*opt) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + struct tc_prio_qopt *qopt = RTA_DATA(opt); + int i; + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return -EINVAL; + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) + return -EINVAL; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= qopt->bands) + return -EINVAL; + } + + sch_tree_lock(sch); + q->bands = qopt->bands; + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + + for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + if (child != &noop_qdisc) + qdisc_destroy(child); + } + sch_tree_unlock(sch); + + for (i=0; i<=TC_PRIO_MAX; i++) { + int band = q->prio2band[i]; + if (q->queues[band] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[band], child); + + if (child != &noop_qdisc) + qdisc_destroy(child); + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int prio_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int i; + + for (i=0; i<TCQ_PRIO_BANDS; i++) + q->queues[i] = &noop_qdisc; + + if (opt == NULL) { + return -EINVAL; + } else { + int err; + + if ((err= prio_tune(sch, opt)) != 0) + return err; + } + MOD_INC_USE_COUNT; + return 0; +} + +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = 
arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +prio_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long prio_get(struct Qdisc *sch, u32 classid) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + return prio_get(sch, classid); +} + + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int prio_delete(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl - 1 > q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl-1]) + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + 
continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = (struct prio_sched_data *)sch->data; + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops prio_class_ops = +{ + prio_graft, + prio_leaf, + + prio_get, + prio_put, + prio_change, + prio_delete, + prio_walk, + + prio_find_tcf, + prio_bind, + prio_put, + + prio_dump_class, +}; + +struct Qdisc_ops prio_qdisc_ops = +{ + NULL, + &prio_class_ops, + "prio", + sizeof(struct prio_sched_data), + + prio_enqueue, + prio_dequeue, + prio_requeue, + prio_drop, + + prio_init, + prio_reset, + prio_destroy, + prio_tune, + + prio_dump, +}; + +#ifdef MODULE + +int init_module(void) +{ + return register_qdisc(&prio_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&prio_qdisc_ops); +} + +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_red.c b/release/src/linux/linux/net/sched/sch_red.c new file mode 100644 index 00000000..64cbc53e --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_red.c @@ -0,0 +1,496 @@ +/* + * net/sched/sch_red.c Random Early Detection queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * J Hadi Salim <hadi@nortel.com> 980914: computation fixes + * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly. 
+ * J Hadi Salim <hadi@nortelnetworks.com> 980816: ECN support + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> + +#define RED_ECN_ECT 0x02 +#define RED_ECN_CE 0x01 + + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + as written down in Fig.17 of the paper. + +Short description. +------------------ + + When a new packet arrives we calculate the average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is the filter time constant (choosen as 2^(-Wlog)), it controls + the inertia of the algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is a negative power of two in order arithmetics to contain + only shifts. 
+ + + Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect the algorithm's behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be reached + if RED works correctly. + + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less than limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave)). + + +NOTES: + +Upper bound on W. +----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 - th_min/S < (1-(1-W)^L)/W + + th_min/S = 32 th_min/S = 4 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc.
+ */ + +struct red_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 Rmask; + u32 Scell_max; + unsigned char flags; + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + char Scell_log; + u8 Stab[256]; + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ + struct tc_red_xstats st; +}; + +static int red_ecn_mark(struct sk_buff *skb) +{ + if (skb->nh.raw + 20 > skb->tail) + return 0; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + u8 tos = skb->nh.iph->tos; + + if (!(tos & RED_ECN_ECT)) + return 0; + + if (!(tos & RED_ECN_CE)) + IP_ECN_set_ce(skb->nh.iph); + + return 1; + } + + case __constant_htons(ETH_P_IPV6): + { + u32 label = *(u32*)skb->nh.raw; + + if (!(label & __constant_htonl(RED_ECN_ECT<<20))) + return 0; + label |= __constant_htonl(RED_ECN_CE<<20); + return 1; + } + + default: + return 0; + } +} + +static int +red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + psched_time_t now; + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + int shift; + + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max, 0); + PSCHED_SET_PASTPERFECT(q->qidlestart); + +/* + The problem: ideally, average length queue recalcultion should + be done over constant clock intervals. This is too expensive, so that + the calculation is driven by outgoing packets. + When the queue is idle we have to model this clock by hand. + + SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) + dummy packets as a burst after idle time, i.e. 
+ + q->qave *= (1-W)^m + + This is an apparently overcomplicated solution (f.e. we have to precompute + a table to make this calculation in reasonable time) + I believe that a simpler model may be used here, + but it is field for experiments. +*/ + shift = q->Stab[us_idle>>q->Scell_log]; + + if (shift) { + q->qave >>= shift; + } else { + /* Approximate initial part of exponent + with linear function: + (1-W)^m ~= 1-mW + ... + + Seems, it is the best solution to + problem of too coarce exponent tabulation. + */ + + us_idle = (q->qave * us_idle)>>q->Scell_log; + if (us_idle < q->qave/2) + q->qave -= us_idle; + else + q->qave >>= 1; + } + } else { + q->qave += sch->stats.backlog - (q->qave >> q->Wlog); + /* NOTE: + q->qave is fixed point number with point at Wlog. + The formulae above is equvalent to floating point + version: + + qave = qave*(1-W) + sch->stats.backlog*W; + --ANK (980924) + */ + } + + if (q->qave < q->qth_min) { + q->qcount = -1; +enqueue: + if (sch->stats.backlog <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->stats.backlog += skb->len; + sch->stats.bytes += skb->len; + sch->stats.packets++; + return NET_XMIT_SUCCESS; + } else { + q->st.pdrop++; + } + kfree_skb(skb); + sch->stats.drops++; + return NET_XMIT_DROP; + } + if (q->qave >= q->qth_max) { + q->qcount = -1; + sch->stats.overlimits++; +mark: + if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { + q->st.early++; + goto drop; + } + q->st.marked++; + goto enqueue; + } + + if (++q->qcount) { + /* The formula used below causes questions. + + OK. qR is random number in the interval 0..Rmask + i.e. 0..(2^Plog). If we used floating point + arithmetics, it would be: (2^Plog)*rnd_num, + where rnd_num is less 1. + + Taking into account, that qave have fixed + point at Wlog, and Plog is related to max_P by + max_P = (qth_max-qth_min)/2^Plog; two lines + below have the following floating point equivalent: + + max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount + + Any questions? 
--ANK (980924) + */ + if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->stats.overlimits++; + goto mark; + } + q->qR = net_random()&q->Rmask; + goto enqueue; + +drop: + kfree_skb(skb); + sch->stats.drops++; + return NET_XMIT_CN; +} + +static int +red_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->stats.backlog += skb->len; + return 0; +} + +static struct sk_buff * +red_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + return skb; + } + PSCHED_GET_TIME(q->qidlestart); + return NULL; +} + +static int +red_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + sch->stats.backlog -= skb->len; + sch->stats.drops++; + q->st.other++; + kfree_skb(skb); + return 1; + } + PSCHED_GET_TIME(q->qidlestart); + return 0; +} + +static void red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + + __skb_queue_purge(&sch->q); + sch->stats.backlog = 0; + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; +} + +static int red_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + struct rtattr *tb[TCA_RED_STAB]; + struct tc_red_qopt *ctl; + + if (opt == NULL || + rtattr_parse(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) || + tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + + sch_tree_lock(sch); + q->flags = 
ctl->flags; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_log = ctl->Scell_log; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->limit = ctl->limit; + memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); + + q->qcount = -1; + if (skb_queue_len(&sch->q) == 0) + PSCHED_SET_PASTPERFECT(q->qidlestart); + sch_tree_unlock(sch); + return 0; +} + +static int red_init(struct Qdisc* sch, struct rtattr *opt) +{ + int err; + + MOD_INC_USE_COUNT; + + if ((err = red_change(sch, opt)) != 0) { + MOD_DEC_USE_COUNT; + } + return err; +} + + +int red_copy_xstats(struct sk_buff *skb, struct tc_red_xstats *st) +{ + RTA_PUT(skb, TCA_XSTATS, sizeof(*st), st); + return 0; + +rtattr_failure: + return 1; +} + +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = (struct red_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_red_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + opt.limit = q->limit; + opt.qth_min = q->qth_min>>q->Wlog; + opt.qth_max = q->qth_max>>q->Wlog; + opt.Wlog = q->Wlog; + opt.Plog = q->Plog; + opt.Scell_log = q->Scell_log; + opt.flags = q->flags; + RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + if (red_copy_xstats(skb, &q->st)) + goto rtattr_failure; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void red_destroy(struct Qdisc *sch) +{ + MOD_DEC_USE_COUNT; +} + +struct Qdisc_ops red_qdisc_ops = +{ + NULL, + NULL, + "red", + sizeof(struct red_sched_data), + + red_enqueue, + red_dequeue, + red_requeue, + red_drop, + + red_init, + red_reset, + red_destroy, + red_change, + + red_dump, +}; + + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&red_qdisc_ops); +} + +void cleanup_module(void) +{ + 
unregister_qdisc(&red_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_sfq.c b/release/src/linux/linux/net/sched/sch_sfq.c new file mode 100644 index 00000000..c96762fb --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_sfq.c @@ -0,0 +1,502 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + This is not the thing that is usually called (W)FQ nowadays. 
+ It does not use any timestamp mechanism, but instead + processes queues in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considered as one. + + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolating interactive + traffic from non-interactive. This means that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + maximal mtu to 2^15-1; number of hash buckets to 1024. + The only goal of these restrictions was that all data + fit into one 4K page :-). Struct sfq_sched_data is + organized in anti-cache manner: all the data for a bucket + are scattered over different locations. This is not good, + but it allowed me to put it into 4K. + + It is easy to increase these values, but not in flight.
*/ + +#define SFQ_DEPTH 128 +#define SFQ_HASH_DIVISOR 1024 + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned char sfq_index; + +struct sfq_head +{ + sfq_index next; + sfq_index prev; +}; + +struct sfq_sched_data +{ +/* Parameters */ + int perturb_period; + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + int limit; + +/* Variables */ + struct timer_list perturb_timer; + int perturbation; + sfq_index tail; /* Index of current slot in round */ + sfq_index max_depth; /* Maximal depth */ + + sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index next[SFQ_DEPTH]; /* Active slots link */ + short allot[SFQ_DEPTH]; /* Current allotment per slot */ + unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ + struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ + struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ +}; + +static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? 
*/ + h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); + h ^= h>>10; + return h & 0x3FF; +} + +#ifndef IPPROTO_ESP +#define IPPROTO_ESP 50 +#endif + +static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + h2 = iph->saddr^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst^skb->protocol; + h2 = (u32)(unsigned long)skb->sk; + } + return sfq_fold_hash(q, h, h2); +} + +extern __inline__ void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d = q->qs[x].qlen + SFQ_DEPTH; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +extern __inline__ void sfq_dec(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + sfq_link(q, x); +} + +extern __inline__ void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + sfq_link(q, x); +} + +static int sfq_drop(struct Qdisc *sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + sfq_index d = q->max_depth; + struct sk_buff *skb; + + /* Queue 
is full! Find the longest slot and + drop a packet from it */ + + if (d > 1) { + sfq_index x = q->dep[d+SFQ_DEPTH].next; + skb = q->qs[x].prev; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb); + sfq_dec(q, x); + sch->q.qlen--; + sch->stats.drops++; + return 1; + } + + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + sfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = SFQ_DEPTH; + sch->stats.drops++; + return 1; + } + + return 0; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit-1) { + sch->stats.bytes += skb->len; + sch->stats.packets++; + return 0; + } + + sfq_drop(sch); + return NET_XMIT_CN; +} + +static int +sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + 
q->tail = x; + } + } + if (++sch->q.qlen < q->limit - 1) + return 0; + + sch->stats.drops++; + sfq_drop(sch); + return NET_XMIT_CN; +} + + + + +static struct sk_buff * +sfq_dequeue(struct Qdisc* sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + struct sk_buff *skb; + sfq_index a, old_a; + + /* No active slots */ + if (q->tail == SFQ_DEPTH) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + sfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? */ + if (q->qs[a].qlen == 0) { + a = q->next[a]; + if (a == old_a) { + q->tail = SFQ_DEPTH; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + return skb; +} + +static void +sfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void sfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + + q->perturbation = net_random()&0x1F; + q->perturb_timer.expires = jiffies + q->perturb_period; + + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + struct tc_sfq_qopt *ctl = RTA_DATA(opt); + + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + sch_tree_lock(sch); + q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); + + while (sch->q.qlen >= q->limit-1) + sfq_drop(sch); + + del_timer(&q->perturb_timer); + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + sch_tree_unlock(sch); + return 0; +} + +static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + int i; + + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = sfq_perturbation; + init_timer(&q->perturb_timer); + + for (i=0; i<SFQ_HASH_DIVISOR; i++) + q->ht[i] = SFQ_DEPTH; + for (i=0; i<SFQ_DEPTH; i++) { + skb_queue_head_init(&q->qs[i]); + q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; + q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + } + q->limit = SFQ_DEPTH; + q->max_depth = 0; + q->tail = SFQ_DEPTH; + if (opt == NULL) { + q->quantum = psched_mtu(sch->dev); + q->perturb_period = 0; + } else { + int err = sfq_change(sch, opt); + if (err) + return err; + } + for (i=0; i<SFQ_DEPTH; i++) + sfq_link(q, i); + MOD_INC_USE_COUNT; + return 0; +} + +static void sfq_destroy(struct Qdisc *sch) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + del_timer(&q->perturb_timer); + MOD_DEC_USE_COUNT; +} + +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = (struct sfq_sched_data *)sch->data; + unsigned char *b = skb->tail; + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period/HZ; + + opt.limit = q->limit; + opt.divisor = SFQ_HASH_DIVISOR; + opt.flows = q->limit; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct Qdisc_ops sfq_qdisc_ops = +{ + NULL, + NULL, + "sfq", + sizeof(struct sfq_sched_data), + + sfq_enqueue, + sfq_dequeue, + sfq_requeue, + sfq_drop, + + 
sfq_init, + sfq_reset, + sfq_destroy, + NULL, /* sfq_change */ + + sfq_dump, +}; + +#ifdef MODULE +int init_module(void) +{ + return register_qdisc(&sfq_qdisc_ops); +} + +void cleanup_module(void) +{ + unregister_qdisc(&sfq_qdisc_ops); +} +#endif +MODULE_LICENSE("GPL"); diff --git a/release/src/linux/linux/net/sched/sch_tbf.c b/release/src/linux/linux/net/sched/sch_tbf.c new file mode 100644 index 00000000..19a3de99 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_tbf.c @@ -0,0 +1,426 @@ +/* + * net/sched/sch_tbf.c Token Bucket Filter queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + ------- + + None. + + Description. + ------------ + + A data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f the number of transmitted bits + does not exceed B + R*(t_f-t_i). 
+ + Packetized version of this definition: + The sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. + ---------- + + Let N(t_i) be B/R initially and N(t) grow continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmitted only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + + + Actually, QoS requires two TBF to be applied to a data stream. + One of them controls steady state burst size, another + one with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at a smaller time scale. + + It is easy to see that P>R, and B>M. If P is infinity, this double + TBF is equivalent to a single one. + + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) + + + NOTES. + ------ + + If TBF throttles, it starts a watchdog timer, which will wake it up + when it is ready to transmit. + Note that the minimal timer resolution is 1/HZ. + If no new packets arrive during this period, + or if the device is not awaken by EOI for some previous packet, + TBF can stop its activity for 1/HZ. + + + This means, that with depth B, the maximal rate is + + R_crit = B*HZ + + F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. + + Note that the peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. 
So, if you need greater peak
+	rates, use alpha with HZ=1000 :-)
+*/
+/*
+ * Private state of one TBF qdisc, kept in sch->data.
+ * The "Parameters" are written by tbf_change(); the "Variables" are the
+ * running token counters and time check-point updated by tbf_dequeue().
+ * tbf_reset() and tbf_change() refill both counters to their maximum.
+ */
+struct tbf_sched_data
+{
+/* Parameters */
+	u32		limit;		/* Maximal length of backlog: bytes */
+	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
+	u32		mtu;		/* Upper bound applied to ptokens in tbf_dequeue() */
+	u32		max_size;	/* Largest skb->len tbf_enqueue() accepts */
+	struct qdisc_rate_table	*R_tab;	/* Table indexed by L2T() */
+	struct qdisc_rate_table	*P_tab;	/* Table indexed by L2T_P(); NULL when no peak rate is set */
+
+/* Variables */
+	long	tokens;			/* Current number of B tokens */
+	long	ptokens;		/* Current number of P tokens */
+	psched_time_t	t_c;		/* Time check-point */
+	struct timer_list wd_timer;	/* Watchdog timer */
+};
+/*
+ * Token cost of a packet of length L: the length is shifted right by the
+ * table's cell_log and used as an index into the table's data[] array.
+ * L2T() reads R_tab, L2T_P() reads P_tab (which must be non-NULL).
+ */
+#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
+#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
+/*
+ * Queue skb at the tail.
+ * A packet longer than q->max_size is dropped without being queued.  If
+ * adding the packet pushes the backlog above q->limit it is unlinked
+ * again (tail drop).  Returns 0 when the packet was accepted and
+ * NET_XMIT_DROP otherwise.  With CONFIG_NET_CLS_POLICE the skb is only
+ * freed here when no reshape_fail hook is installed or the hook returns
+ * non-zero.
+ */
+static int
+tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+	if (skb->len > q->max_size)
+		goto drop;
+	__skb_queue_tail(&sch->q, skb);
+	if ((sch->stats.backlog += skb->len) <= q->limit) {
+		sch->stats.bytes += skb->len;
+		sch->stats.packets++;
+		return 0;
+	}
+
+	/* Drop action: undo the things that we just did,
+	 * i.e. make tail drop
+	 */
+
+	__skb_unlink(skb, &sch->q);
+	sch->stats.backlog -= skb->len;
+
+drop:
+	sch->stats.drops++;
+#ifdef CONFIG_NET_CLS_POLICE
+	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
+#endif
+		kfree_skb(skb);
+	return NET_XMIT_DROP;
+}
+/*
+ * Put skb back at the head of the queue and add its length to the
+ * backlog counter.  Always returns 0.
+ */
+static int
+tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	__skb_queue_head(&sch->q, skb);
+	sch->stats.backlog += skb->len;
+	return 0;
+}
+/*
+ * Discard the packet at the tail of the queue, if there is one.
+ * Returns 1 when a packet was freed, 0 when the queue was empty.
+ */
+static int
+tbf_drop(struct Qdisc* sch)
+{
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue_tail(&sch->q);
+	if (skb) {
+		sch->stats.backlog -= skb->len;
+		sch->stats.drops++;
+		kfree_skb(skb);
+		return 1;
+	}
+	return 0;
+}
+/*
+ * wd_timer callback (installed by tbf_init(), armed by tbf_dequeue()).
+ * arg is the struct Qdisc pointer stored in wd_timer.data; the handler
+ * clears the throttled flag and asks for the device to be scheduled.
+ */
+static void tbf_watchdog(unsigned long arg)
+{
+	struct Qdisc *sch = (struct Qdisc*)arg;
+
+	sch->flags &= ~TCQ_F_THROTTLED;
+	netif_schedule(sch->dev);
+}
+/*
+ * Hand out the head packet if both buckets hold enough tokens for it.
+ * The tokens earned since q->t_c are added to each bucket, the buckets
+ * are capped at q->buffer and q->mtu respectively, and the packet's cost
+ * is subtracted.  If neither result is negative the new state is stored
+ * and the packet is returned.  Otherwise the packet goes back to the
+ * head of the queue, the qdisc is marked throttled, the overlimit
+ * counter is bumped and NULL is returned; the watchdog is armed unless
+ * the device queue is stopped.  NULL is also returned for an empty queue.
+ */
+static struct sk_buff *
+tbf_dequeue(struct Qdisc* sch)
+{
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&sch->q);
+
+	if (skb) {
+		psched_time_t now;
+		long toks;
+		long ptoks = 0;		/* stays 0 (never negative) when there is no peak table */
+
+		PSCHED_GET_TIME(now);
+
+		/* NOTE(review): presumably the elapsed time since t_c, bounded by q->buffer - confirm against the macro */
+		toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);
+
+		if (q->P_tab) {
+			ptoks = toks + q->ptokens;
+			if (ptoks > (long)q->mtu)
+				ptoks = q->mtu;
+			ptoks -= L2T_P(q, skb->len);
+		}
+		toks += q->tokens;
+		if (toks > (long)q->buffer)
+			toks = q->buffer;
+		toks -= L2T(q, skb->len);
+
+		if ((toks|ptoks) >= 0) {	/* the OR is negative iff either value is negative */
+			q->t_c = now;
+			q->tokens = toks;
+			q->ptokens = ptoks;
+			sch->stats.backlog -= skb->len;
+			sch->flags &= ~TCQ_F_THROTTLED;
+			return skb;
+		}
+
+		if (!netif_queue_stopped(sch->dev)) {
+			long delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks));	/* larger of the two deficits */
+
+			if (delay == 0)
+				delay = 1;	/* never arm the timer for the current jiffy */
+
+			mod_timer(&q->wd_timer, jiffies+delay);
+		}
+
+		/* Maybe we have a shorter packet in the queue,
+		   which can be sent now. It sounds cool,
+		   but, however, this is wrong in principle.
+		   We MUST NOT reorder packets under these circumstances.
+
+		   Really, if we split the flow into independent
+		   subflows, it would be a very good solution.
+		   This is the main idea of all FQ algorithms
+		   (cf. CSZ, HPFQ, HFSC)
+		 */
+		__skb_queue_head(&sch->q, skb);
+
+		sch->flags |= TCQ_F_THROTTLED;
+		sch->stats.overlimits++;
+	}
+	return NULL;
+}
+
+/*
+ * Return the qdisc to its initial state: purge the queue, zero the
+ * backlog, restart the time check-point, refill both buckets, clear the
+ * throttled flag and cancel a pending watchdog.
+ */
+static void
+tbf_reset(struct Qdisc* sch)
+{
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+	skb_queue_purge(&sch->q);
+	sch->stats.backlog = 0;
+	PSCHED_GET_TIME(q->t_c);
+	q->tokens = q->buffer;
+	q->ptokens = q->mtu;
+	sch->flags &= ~TCQ_F_THROTTLED;
+	del_timer(&q->wd_timer);
+}
+/*
+ * Parse the nested attributes in opt and install new parameters.
+ * TCA_TBF_PARMS must be present and at least sizeof(struct tc_tbf_qopt)
+ * long, and a rate table must be obtainable.  When a peak rate is given
+ * it must be greater than the rate and its table must be obtainable too.
+ * max_size is derived from the tables: for each table the first slot
+ * whose cost exceeds the bucket (buffer, resp. mtu) gives the bound
+ * (n << cell_log) - 1, and the smaller of the two bounds is used.
+ * The new values are swapped in under sch_tree_lock(); the tables that
+ * were replaced (or, on failure, the ones just obtained) are released at
+ * the done label.  Returns 0 on success and -EINVAL otherwise.
+ */
+static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
+{
+	int err = -EINVAL;
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+	struct rtattr *tb[TCA_TBF_PTAB];
+	struct tc_tbf_qopt *qopt;
+	struct qdisc_rate_table *rtab = NULL;
+	struct qdisc_rate_table *ptab = NULL;
+	int max_size,n;
+
+	if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
+	    tb[TCA_TBF_PARMS-1] == NULL ||
+	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
+		goto done;
+
+	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
+	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
+	if (rtab == NULL)
+		goto done;
+
+	if (qopt->peakrate.rate) {
+		if (qopt->peakrate.rate > qopt->rate.rate)
+			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
+		if (ptab == NULL)	/* peak rate not above rate, or no table: reject */
+			goto done;
+	}
+
+	for (n = 0; n < 256; n++)
+		if (rtab->data[n] > qopt->buffer) break;
+	max_size = (n << qopt->rate.cell_log)-1;
+	if (ptab) {
+		int size;
+
+		for (n = 0; n < 256; n++)
+			if (ptab->data[n] > qopt->mtu) break;
+		size = (n << qopt->peakrate.cell_log)-1;
+		if (size < max_size) max_size = size;
+	}
+	if (max_size < 0)	/* even slot 0 costs more than the bucket holds */
+		goto done;
+
+	sch_tree_lock(sch);
+	q->limit = qopt->limit;
+	q->mtu = qopt->mtu;
+	q->max_size = max_size;
+	q->buffer = qopt->buffer;
+	q->tokens = q->buffer;
+	q->ptokens = q->mtu;
+	rtab = xchg(&q->R_tab, rtab);	/* rtab/ptab now hold the previous tables */
+	ptab = xchg(&q->P_tab, ptab);
+	sch_tree_unlock(sch);
+	err = 0;
+done:
+	if (rtab)
+		qdisc_put_rtab(rtab);
+	if (ptab)
+		qdisc_put_rtab(ptab);
+	return err;
+}
+/*
+ * Set up a new TBF qdisc.  Options are mandatory (-EINVAL without them).
+ * Takes a module reference, starts the time check-point, prepares the
+ * watchdog timer and applies the options through tbf_change(); the
+ * module reference is dropped again if that fails.
+ */
+static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
+{
+	int err;
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	MOD_INC_USE_COUNT;
+
+	PSCHED_GET_TIME(q->t_c);
+	init_timer(&q->wd_timer);
+	q->wd_timer.function = tbf_watchdog;
+	q->wd_timer.data = (unsigned long)sch;
+
+	if ((err = tbf_change(sch, opt)) != 0) {
+		MOD_DEC_USE_COUNT;
+	}
+	return err;
+}
+/*
+ * Tear down: cancel the watchdog, release both rate tables and drop the
+ * module reference taken in tbf_init().
+ */
+static void tbf_destroy(struct Qdisc *sch)
+{
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+
+	del_timer(&q->wd_timer);
+
+	if (q->P_tab)
+		qdisc_put_rtab(q->P_tab);
+	if (q->R_tab)
+		qdisc_put_rtab(q->R_tab);
+
+	MOD_DEC_USE_COUNT;
+}
+/*
+ * Append the current configuration to skb as a TCA_OPTIONS attribute
+ * that nests one TCA_TBF_PARMS attribute.  Returns skb->len on success;
+ * on failure (the rtattr_failure label, reached from RTA_PUT) the skb is
+ * trimmed back to where it started and -1 is returned.
+ */
+static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
+	unsigned char	 *b = skb->tail;
+	struct rtattr *rta;
+	struct tc_tbf_qopt opt;
+
+	rta = (struct rtattr*)b;
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);	/* empty outer attribute; its length is fixed up below */
+
+	opt.limit = q->limit;
+	opt.rate = q->R_tab->rate;
+	if (q->P_tab)
+		opt.peakrate = q->P_tab->rate;
+	else
+		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
+	opt.mtu = q->mtu;
+	opt.buffer = q->buffer;
+	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
+	rta->rta_len = skb->tail - b;	/* outer length now covers the nested attribute */
+
+	return skb->len;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+/*
+ * Operations table registered for the "tbf" qdisc.  The initializer is
+ * positional; the meaning of each slot follows struct Qdisc_ops, which
+ * is declared outside this file.
+ */
+struct Qdisc_ops tbf_qdisc_ops =
+{
+	NULL,
+	NULL,
+	"tbf",
+	sizeof(struct tbf_sched_data),
+
+	tbf_enqueue,
+	tbf_dequeue,
+	tbf_requeue,
+	tbf_drop,
+
+	tbf_init,
+	tbf_reset,
+	tbf_destroy,
+	tbf_change,
+
+	tbf_dump,
+};
+
+/* Module entry points: register / unregister tbf_qdisc_ops. */
+#ifdef MODULE
+int init_module(void)
+{
+	return register_qdisc(&tbf_qdisc_ops);
+}
+
+void cleanup_module(void) 
+{
+	unregister_qdisc(&tbf_qdisc_ops);
+}
+#endif
+MODULE_LICENSE("GPL");
diff --git a/release/src/linux/linux/net/sched/sch_teql.c b/release/src/linux/linux/net/sched/sch_teql.c
new file mode 100644
index 
00000000..7bc13e30 --- /dev/null +++ b/release/src/linux/linux/net/sched/sch_teql.c @@ -0,0 +1,496 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + +/* + How to setup it. + ---------------- + + After loading this module you will find a new device teqlN + and new qdisc with the same name. To join a slave to the equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices, i.e., they must raise the tbusy + signal and generate EOI events. If you want to equalize virtual devices + like tunnels, use a normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make the resulting + equalized link unusable, because of huge packet reordering. 
+      I estimate an upper useful difference as ~10 times.
+   3. If the slave requires address resolution, only protocols using
+      neighbour cache (IPv4/IPv6) will work over the equalized link.
+      Other protocols are still allowed to use the slave device directly,
+      which will not break load balancing, though native slave
+      traffic will have the highest priority.  */
+
+/*
+ * One equalizer: the qdisc operations table that slaves attach with,
+ * the master network device, a pointer into the circular list of slave
+ * qdiscs and the master's device statistics.
+ * qops must stay the first member: teql_qdisc_init() casts sch->ops back
+ * to a struct teql_master pointer.
+ */
+struct teql_master
+{
+	struct Qdisc_ops qops;
+	struct net_device dev;
+	struct Qdisc *slaves;
+	struct net_device_stats stats;
+};
+
+/* Private data of a slave qdisc (sch->data). */
+struct teql_sched_data
+{
+	struct Qdisc *next;		/* next slave in the circular list */
+	struct teql_master *m;		/* master this slave belongs to */
+	struct neighbour *ncache;	/* neighbour entry kept by __teql_resolve() */
+	struct sk_buff_head q;		/* the slave's own packet queue */
+};
+
+/* Successor of slave qdisc q in the circular slave list. */
+#define NEXT_SLAVE(q) (((struct teql_sched_data*)((q)->data))->next)
+
+/*
+ * Device flags the master takes over from its slaves.
+ * IFF_MULTICAST belongs in this mask: teql_qdisc_init() and
+ * teql_master_open() both clear it per slave.  The original definition
+ * listed IFF_BROADCAST twice and left IFF_MULTICAST out, so the
+ * multicast flag was never propagated through the mask.
+ */
+#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST)
+
+/* "teql*" qdisc routines */
+
+/*
+ * Queue skb on the slave's private queue.  The packet is accepted while
+ * the queue length stays within the slave device's tx_queue_len;
+ * otherwise it is unlinked again, freed and counted as a drop.
+ * Returns 0 on success, NET_XMIT_DROP on drop.
+ */
+static int
+teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	struct net_device *dev = sch->dev;
+	struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
+
+	__skb_queue_tail(&q->q, skb);
+	if (q->q.qlen <= dev->tx_queue_len) {
+		sch->stats.bytes += skb->len;
+		sch->stats.packets++;
+		return 0;
+	}
+
+	__skb_unlink(skb, &q->q);
+	kfree_skb(skb);
+	sch->stats.drops++;
+	return NET_XMIT_DROP;
+}
+
+/* Put skb back at the head of the slave's private queue.  Returns 0. */
+static int
+teql_requeue(struct sk_buff *skb, struct Qdisc* sch)
+{
+	struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
+
+	__skb_queue_head(&q->q, skb);
+	return 0;
+}
+
+/*
+ * Take the next packet from the slave's private queue.  When that queue
+ * is empty this slave becomes the master's current slave and the master
+ * device's queue is woken.  sch->q.qlen is set to the sum of the private
+ * queue length and the length of the master device's qdisc queue.
+ */
+static struct sk_buff *
+teql_dequeue(struct Qdisc* sch)
+{
+	struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
+	struct sk_buff *skb;
+
+	skb = __skb_dequeue(&dat->q);
+	if (skb == NULL) {
+		struct net_device *m = dat->m->dev.qdisc->dev;
+		if (m) {
+			dat->m->slaves = sch;
+			netif_wake_queue(m);
+		}
+	}
+	sch->q.qlen = dat->q.qlen + dat->m->dev.qdisc->q.qlen;
+	return skb;
+}
+
+/* neigh_release() that tolerates a NULL pointer. */
+static __inline__ void
+teql_neigh_release(struct neighbour *n)
+{
+	if (n)
+		neigh_release(n);
+}
+
+/* Purge the private queue, zero qlen and drop the cached neighbour. */
+static void
+teql_reset(struct Qdisc* sch)
+{
+	struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
+
+	skb_queue_purge(&dat->q);
+	sch->q.qlen = 0;
+	teql_neigh_release(xchg(&dat->ncache, NULL));
+}
+
+/*
+ * Remove sch from the master's circular slave list.  If sch was the
+ * master's current slave the pointer is advanced; if it was the only
+ * slave the list becomes empty and the master device's qdisc is reset
+ * under its queue_lock.  The private queue and the cached neighbour are
+ * released, and the module reference taken in teql_qdisc_init() is
+ * dropped.
+ */
+static void
+teql_destroy(struct Qdisc* sch)
+{
+	struct Qdisc *q, *prev;
+	struct teql_sched_data *dat = (struct teql_sched_data *)sch->data;
+	struct teql_master *master = dat->m;
+
+	if ((prev = master->slaves) != NULL) {
+		do {
+			q = NEXT_SLAVE(prev);
+			if (q == sch) {
+				NEXT_SLAVE(prev) = NEXT_SLAVE(q);
+				if (q == master->slaves) {
+					master->slaves = NEXT_SLAVE(q);
+					if (q == master->slaves) {
+						/* q pointed at itself: it was the last slave */
+						master->slaves = NULL;
+						spin_lock_bh(&master->dev.queue_lock);
+						qdisc_reset(master->dev.qdisc);
+						spin_unlock_bh(&master->dev.queue_lock);
+					}
+				}
+				skb_queue_purge(&dat->q);
+				teql_neigh_release(xchg(&dat->ncache, NULL));
+				break;
+			}
+
+		} while ((prev = q) != master->slaves);
+	}
+
+	MOD_DEC_USE_COUNT;
+}
+
+/*
+ * Attach sch (and thereby its device) to the master as a new slave.
+ * Rejected with -EINVAL when the slave needs a larger hard header than
+ * the master provides, and with -ELOOP when the device is the master
+ * itself.  If the master already has slaves and is up, the new slave
+ * must support every FMASK flag the master advertises and an MTU at
+ * least as large as the master's (-EINVAL otherwise); if the master is
+ * down, its flags and MTU are narrowed to what the new slave supports.
+ * The first slave donates its MTU and FMASK flags to the master.
+ * opt is not used.
+ */
+static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct net_device *dev = sch->dev;
+	struct teql_master *m = (struct teql_master*)sch->ops;
+	struct teql_sched_data *q = (struct teql_sched_data *)sch->data;
+
+	if (dev->hard_header_len > m->dev.hard_header_len)
+		return -EINVAL;
+
+	if (&m->dev == dev)
+		return -ELOOP;
+
+	q->m = m;
+
+	skb_queue_head_init(&q->q);
+
+	if (m->slaves) {
+		if (m->dev.flags & IFF_UP) {
+			if ((m->dev.flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT))
+			    || (m->dev.flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST))
+			    || (m->dev.flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST))
+			    || dev->mtu < m->dev.mtu)
+				return -EINVAL;
+		} else {
+			if (!(dev->flags&IFF_POINTOPOINT))
+				m->dev.flags &= ~IFF_POINTOPOINT;
+			if (!(dev->flags&IFF_BROADCAST))
+				m->dev.flags &= ~IFF_BROADCAST;
+			if (!(dev->flags&IFF_MULTICAST))
+				m->dev.flags &= ~IFF_MULTICAST;
+			if (dev->mtu < m->dev.mtu)
+				m->dev.mtu = dev->mtu;
+		}
+		/* link the new slave in right after the current one */
+		q->next = NEXT_SLAVE(m->slaves);
+		NEXT_SLAVE(m->slaves) = sch;
+	} else {
+		/* first slave: a one-element circular list */
+		q->next = sch;
+		m->slaves = sch;
+		m->dev.mtu = dev->mtu;
+		m->dev.flags = (m->dev.flags&~FMASK)|(dev->flags&FMASK);
+	}
+
+	MOD_INC_USE_COUNT;
+	return 0;
+}
+
+/* "teql*" netdevice routines */
+
+/*
+ * Build the link-layer header of skb for transmission through slave
+ * device dev, resolving the neighbour on that device.  The neighbour
+ * cached in the slave's private data is reused when it belongs to the
+ * same table and has the same key as the route's neighbour; otherwise a
+ * lookup is done on dev.
+ * Returns 0 when the header was built (the neighbour then replaces the
+ * cached one), a negative errno on failure, and, while
+ * neigh_event_send() reports non-zero, -EAGAIN if skb_res is NULL or 1
+ * if it is not.
+ * NOTE(review): the 1 case presumably means the neighbour code has
+ * taken over skb_res - confirm against neigh_event_send().
+ */
+static int
+__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
+{
+	struct teql_sched_data *q = (void*)dev->qdisc->data;
+	struct neighbour *mn = skb->dst->neighbour;
+	struct neighbour *n = q->ncache;
+
+	if (mn->tbl == NULL)
+		return -EINVAL;
+	if (n && n->tbl == mn->tbl &&
+	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
+		atomic_inc(&n->refcnt);
+	} else {
+		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
+		if (IS_ERR(n))
+			return PTR_ERR(n);
+	}
+	if (neigh_event_send(n, skb_res) == 0) {
+		int err;
+		read_lock(&n->lock);
+		err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
+		read_unlock(&n->lock);
+		if (err < 0) {
+			neigh_release(n);
+			return -EINVAL;
+		}
+		teql_neigh_release(xchg(&q->ncache, n));
+		return 0;
+	}
+	neigh_release(n);
+	return (skb_res == NULL) ? -EAGAIN : 1;
+}
+
+/*
+ * Resolve only when it is needed: returns 0 straight away if the slave
+ * builds no hard header or the skb carries no route/neighbour.
+ */
+static __inline__ int
+teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
+{
+	if (dev->hard_header == NULL ||
+	    skb->dst == NULL ||
+	    skb->dst->neighbour == NULL)
+		return 0;
+	return __teql_resolve(skb, skb_res, dev);
+}
+
+/*
+ * hard_start_xmit of the master device: walk the slave ring once,
+ * starting at the current slave, and hand skb to the first slave that
+ * is running, not stopped, resolves the destination and accepts the
+ * packet.  On success the current-slave pointer moves to the following
+ * slave so that the next packet starts with a different device.
+ * If some slave could not resolve the address and no pass with skb_res
+ * set has been made yet, the walk is repeated once with skb_res = skb.
+ * Returns 0 when the skb was sent, taken over or dropped, and 1 (after
+ * stopping the master queue) when slaves were merely busy.
+ */
+static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct teql_master *master = (void*)dev->priv;
+	struct Qdisc *start, *q;
+	int busy;
+	int nores;
+	int len = skb->len;	/* saved now; the skb must not be touched after a successful xmit */
+	struct sk_buff *skb_res = NULL;
+
+	start = master->slaves;
+
+restart:
+	nores = 0;
+	busy = 0;
+
+	if ((q = start) == NULL)
+		goto drop;
+
+	do {
+		struct net_device *slave = q->dev;
+
+		/* "continue" jumps to the loop condition, which advances q */
+		if (slave->qdisc_sleeping != q)
+			continue;
+		if (netif_queue_stopped(slave) || ! netif_running(slave)) {
+			busy = 1;
+			continue;
+		}
+
+		switch (teql_resolve(skb, skb_res, slave)) {
+		case 0:
+			if (spin_trylock(&slave->xmit_lock)) {
+				slave->xmit_lock_owner = smp_processor_id();
+				if (!netif_queue_stopped(slave) &&
+				    slave->hard_start_xmit(skb, slave) == 0) {
+					slave->xmit_lock_owner = -1;
+					spin_unlock(&slave->xmit_lock);
+					master->slaves = NEXT_SLAVE(q);
+					netif_wake_queue(dev);
+					master->stats.tx_packets++;
+					master->stats.tx_bytes += len;
+					return 0;
+				}
+				slave->xmit_lock_owner = -1;
+				spin_unlock(&slave->xmit_lock);
+			}
+			if (netif_queue_stopped(dev))
+				busy = 1;
+			break;
+		case 1:
+			master->slaves = NEXT_SLAVE(q);
+			return 0;
+		default:
+			nores = 1;
+			break;
+		}
+		/* strip whatever precedes the network header before trying the next slave */
+		__skb_pull(skb, skb->nh.raw - skb->data);
+	} while ((q = NEXT_SLAVE(q)) != start);
+
+	if (nores && skb_res == NULL) {
+		skb_res = skb;
+		goto restart;
+	}
+
+	if (busy) {
+		netif_stop_queue(dev);
+		return 1;
+	}
+	master->stats.tx_errors++;
+
+drop:
+	master->stats.tx_dropped++;
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * open() of the master device.  Fails with -EUNATCH when there is no
+ * slave (or a slave qdisc has no device) and with -EINVAL when a slave
+ * needs more than LL_MAX_HEADER bytes of header.  Otherwise the master
+ * gets the smallest slave MTU and those FMASK flags that every slave
+ * has, its queue is started and a module reference is taken.
+ */
+static int teql_master_open(struct net_device *dev)
+{
+	struct Qdisc * q;
+	struct teql_master *m = (void*)dev->priv;
+	int mtu = 0xFFFE;
+	unsigned flags = FMASK;	/* start with every maskable flag, clear what a slave lacks */
+
+	if (m->slaves == NULL)
+		return -EUNATCH;
+
+	q = m->slaves;
+	do {
+		struct net_device *slave = q->dev;
+
+		if (slave == NULL)
+			return -EUNATCH;
+
+		if (slave->mtu < mtu)
+			mtu = slave->mtu;
+		if (slave->hard_header_len > LL_MAX_HEADER)
+			return -EINVAL;
+
+		/* If all the slaves are BROADCAST, master is BROADCAST
+		   If all the slaves are PtP, master is PtP
+		   Otherwise, master is NBMA.
+		 */
+		if (!(slave->flags&IFF_POINTOPOINT))
+			flags &= ~IFF_POINTOPOINT;
+		if (!(slave->flags&IFF_BROADCAST))
+			flags &= ~IFF_BROADCAST;
+		if (!(slave->flags&IFF_MULTICAST))
+			flags &= ~IFF_MULTICAST;
+	} while ((q = NEXT_SLAVE(q)) != m->slaves);
+
+	m->dev.mtu = mtu;
+	m->dev.flags = (m->dev.flags&~FMASK) | flags;
+	netif_start_queue(&m->dev);
+	MOD_INC_USE_COUNT;
+	return 0;
+}
+
+/* stop() of the master device: stop its queue, drop the module reference. */
+static int teql_master_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+/* get_stats() of the master device: the counters kept in struct teql_master. */
+static struct net_device_stats *teql_master_stats(struct net_device *dev)
+{
+	struct teql_master *m = (void*)dev->priv;
+	return &m->stats;
+}
+
+/*
+ * change_mtu() of the master device.  The new MTU must be at least 68
+ * and must not exceed the MTU of any slave; -EINVAL otherwise.
+ */
+static int teql_master_mtu(struct net_device *dev, int new_mtu)
+{
+	struct teql_master *m = (void*)dev->priv;
+	struct Qdisc *q;
+
+	if (new_mtu < 68)
+		return -EINVAL;
+
+	q = m->slaves;
+	if (q) {
+		do {
+			if (new_mtu > q->dev->mtu)
+				return -EINVAL;
+		} while ((q=NEXT_SLAVE(q)) != m->slaves);
+	}
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+/* init() hook of the master device: install the methods and defaults. */
+static int teql_master_init(struct net_device *dev)
+{
+	dev->open		= teql_master_open;
+	dev->hard_start_xmit	= teql_master_xmit;
+	dev->stop		= teql_master_close;
+	dev->get_stats		= teql_master_stats;
+	dev->change_mtu		= teql_master_mtu;
+	dev->type		= ARPHRD_VOID;
+	dev->mtu		= 1500;
+	dev->tx_queue_len	= 100;
+	dev->flags		= IFF_NOARP;
+	dev->hard_header_len	= LL_MAX_HEADER;
+	return 0;
+}
+
+/*
+ * The single equalizer instance.  Only the embedded Qdisc_ops is
+ * initialized here (positionally); its id string is filled in with the
+ * allocated device name at module initialization.
+ */
+static struct teql_master the_master = {
+{
+	NULL,
+	NULL,
+	"",
+	sizeof(struct teql_sched_data),
+
+	teql_enqueue,
+	teql_dequeue,
+	teql_requeue,
+	NULL,
+
+	teql_qdisc_init,
+	teql_reset,
+	teql_destroy,
+	NULL,
+},};
+
+
+/*
+ * Module / built-in initialization: allocate a "teql%d" device name,
+ * copy it into the qdisc id, then register the master device and the
+ * qdisc operations (the device is unregistered again if the qdisc
+ * cannot be registered).  Everything runs under the RTNL lock, which is
+ * released on every exit path; the original returned with the lock
+ * still held when dev_alloc_name() failed.
+ * Returns 0 on success or a negative errno.
+ */
+#ifdef MODULE
+int init_module(void)
+#else
+int __init teql_init(void)
+#endif
+{
+	int err;
+
+	rtnl_lock();
+
+	the_master.dev.priv = (void*)&the_master;
+	err = dev_alloc_name(&the_master.dev, "teql%d");
+	if (err < 0)
+		goto out;
+	memcpy(the_master.qops.id, the_master.dev.name, IFNAMSIZ);
+	the_master.dev.init = teql_master_init;
+
+	err = register_netdevice(&the_master.dev);
+	if (err == 0) {
+		err = register_qdisc(&the_master.qops);
+		if (err)
+			unregister_netdevice(&the_master.dev);
+	}
+out:
+	rtnl_unlock();
+	return err;
+}
+
+/* Module unload: unregister the qdisc and the master device under RTNL. */
+#ifdef MODULE
+void cleanup_module(void)
+{
+	rtnl_lock();
+	unregister_qdisc(&the_master.qops);
+	unregister_netdevice(&the_master.dev);
+	rtnl_unlock();
+}
+#endif
+MODULE_LICENSE("GPL"); |