Diffstat (limited to 'release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c')
-rw-r--r--  release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c  954
1 file changed, 954 insertions, 0 deletions
diff --git a/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 00000000..f64ddabf
--- /dev/null
+++ b/release/src/linux/linux/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,954 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+#ifdef MODULE
+#define __NO_VERSION__
+#endif
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/brlock.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h> /* For tcp_prot in getorigdst */
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
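+/* Debug output is compiled out here; redefining DEBUGP (eg. to printk)
+   enables verbose NAT tracing. */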
+#define DEBUGP(format, args...)
+
+DECLARE_RWLOCK(ip_nat_lock);
+DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
+
+/* Calculated at init based on memory size */
+static unsigned int ip_nat_htable_size;
+
+static struct list_head *bysource;
+static struct list_head *byipsproto;
+LIST_HEAD(protos);
+LIST_HEAD(helpers);
+
+extern struct ip_nat_protocol unknown_nat_protocol;
+
+/* We keep extra hashes for each conntrack, for fast searching. */
+static inline size_t
+hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
+{
+ /* Modified src and dst, to ensure we don't create two
+ identical streams. */
+ return (src + dst + proto) % ip_nat_htable_size;
+}
+
+static inline size_t
+hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
+{
+ /* Original src, to ensure we map it consistently if poss. */
+ return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
+}
+
+/* No one is using the conntrack by the time this is called. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+ struct ip_nat_info *info = &conn->nat.info;
+
+ if (!info->initialized)
+ return;
+
+ IP_NF_ASSERT(info->bysource.conntrack);
+ IP_NF_ASSERT(info->byipsproto.conntrack);
+
+ WRITE_LOCK(&ip_nat_lock);
+ LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conn->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum)],
+ &info->bysource);
+
+ LIST_DELETE(&byipsproto
+ [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conn->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum)],
+ &info->byipsproto);
+ WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong. Also works for incomplete packets (eg. ICMP dest
+ * unreachables.) */
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+ u_int32_t diffs[] = { oldvalinv, newval };
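+	/* One's-complement sum of ~oldval and newval, folded into the
+	   inverted old checksum: the standard incremental checksum
+	   update (cf. RFC 1624). */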
+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+ oldcheck^0xFFFF));
+}
+
+static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
+{
+ return i->protonum == proto;
+}
+
+struct ip_nat_protocol *
+find_nat_proto(u_int16_t protonum)
+{
+ struct ip_nat_protocol *i;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
+ if (!i)
+ i = &unknown_nat_protocol;
+ return i;
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+	/* Connection tracking only keeps track of incoming tuples, not
+	   outgoing ones.  With NAT they have no fixed mapping, so we
+	   invert the tuple and look for the incoming reply.
+
+ We could keep a separate hash if this proves too slow. */
+ struct ip_conntrack_tuple reply;
+
+ invert_tuplepr(&reply, tuple);
+ return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* Does the tuple + the source manip fall within the range mr? */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_manip *manip,
+ const struct ip_nat_multi_range *mr)
+{
+ struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
+ unsigned int i;
+ struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
+
+ for (i = 0; i < mr->rangesize; i++) {
+ /* If we are allowed to map IPs, then we must be in the
+ range specified, otherwise we must be unchanged. */
+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+ if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
+ || (ntohl(newtuple.src.ip)
+ > ntohl(mr->range[i].max_ip)))
+ continue;
+ } else {
+ if (newtuple.src.ip != tuple->src.ip)
+ continue;
+ }
+
+ if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
+ &mr->range[i].min, &mr->range[i].max))
+ return 1;
+ }
+ return 0;
+}
+
+static inline int
+src_cmp(const struct ip_nat_hash *i,
+ const struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr)
+{
+ return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+ == tuple->dst.protonum
+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+ == tuple->src.ip
+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+ == tuple->src.u.all
+ && in_range(tuple,
+ &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ mr));
+}
+
+/* Only called for SRC manip */
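+/* Look in the bysource hash for an existing conntrack with the same
+   original src whose mapping also fits the requested range, so that the
+   same source manipulation can be reused. */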
+static struct ip_conntrack_manip *
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr)
+{
+ unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
+ struct ip_nat_hash *i;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
+ if (i)
+ return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
+ else
+ return NULL;
+}
+
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+/* If it's really a local destination manip, it may need to do a
+ source manip too. */
+static int
+do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
+{
+ struct rtable *rt;
+
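+	/* Route lookup: rt_src is the local source address the stack
+	   would use to reach var_ip; that becomes the rewritten
+	   other-side address. */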
+ if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
+ DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
+ NIPQUAD(var_ip));
+ return 0;
+ }
+
+ *other_ipp = rt->rt_src;
+ ip_rt_put(rt);
+ return 1;
+}
+#endif
+
+/* Simple way to iterate through all. */
+static inline int fake_cmp(const struct ip_nat_hash *i,
+ u_int32_t src, u_int32_t dst, u_int16_t protonum,
+ unsigned int *score,
+ const struct ip_conntrack *conntrack)
+{
+ /* Compare backwards: we're dealing with OUTGOING tuples, and
+ inside the conntrack is the REPLY tuple. Don't count this
+ conntrack. */
+ if (i->conntrack != conntrack
+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
+ && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
+ == protonum))
+ (*score)++;
+ return 0;
+}
+
+static inline unsigned int
+count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
+ const struct ip_conntrack *conntrack)
+{
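+	/* Count connections already mapped onto this src/dst/proto
+	   triple (as seen outgoing); the caller picks the lowest score. */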
+ unsigned int score = 0;
+
+ MUST_BE_READ_LOCKED(&ip_nat_lock);
+ LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
+ fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
+ conntrack);
+
+ return score;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+ src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
+ if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+ 1-65535, we don't do pro-rata allocation based on ports; we choose
+ the ip with the lowest src-ip/dst-ip/proto usage.
+
+ If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
+ range), we eliminate that and try again. This is not the most
+ efficient approach, but if you're worried about that, don't hand us
+ ranges you don't really have. */
+static struct ip_nat_range *
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr,
+ const struct ip_conntrack *conntrack,
+ unsigned int hooknum)
+{
+ unsigned int i;
+ struct {
+ const struct ip_nat_range *range;
+ unsigned int score;
+ struct ip_conntrack_tuple tuple;
+ } best = { NULL, 0xFFFFFFFF };
+ u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
+ static unsigned int randomness = 0;
+
+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
+ var_ipp = &tuple->src.ip;
+ saved_ip = tuple->dst.ip;
+ other_ipp = &tuple->dst.ip;
+ } else {
+ var_ipp = &tuple->dst.ip;
+ saved_ip = tuple->src.ip;
+ other_ipp = &tuple->src.ip;
+ }
+	/* Don't call do_extra_mangle unless necessary (it overrides
+	   explicit socket bindings, for example) */
+ orig_dstip = tuple->dst.ip;
+
+ IP_NF_ASSERT(mr->rangesize >= 1);
+ for (i = 0; i < mr->rangesize; i++) {
+ /* Host order */
+ u_int32_t minip, maxip, j;
+
+ /* Don't do ranges which are already eliminated. */
+ if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
+ continue;
+ }
+
+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+ minip = ntohl(mr->range[i].min_ip);
+ maxip = ntohl(mr->range[i].max_ip);
+ } else
+ minip = maxip = ntohl(*var_ipp);
+
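+		/* Start each scan at a different offset so successive
+		   connections spread across the range instead of piling
+		   onto minip. */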
+ randomness++;
+ for (j = 0; j < maxip - minip + 1; j++) {
+ unsigned int score;
+
+ *var_ipp = htonl(minip + (randomness + j)
+ % (maxip - minip + 1));
+
+ /* Reset the other ip in case it was mangled by
+ * do_extra_mangle last time. */
+ *other_ipp = saved_ip;
+
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ if (hooknum == NF_IP_LOCAL_OUT
+ && *var_ipp != orig_dstip
+ && !do_extra_mangle(*var_ipp, other_ipp)) {
+ DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
+ i, NIPQUAD(*var_ipp));
+ /* Can't route? This whole range part is
+ * probably screwed, but keep trying
+ * anyway. */
+ continue;
+ }
+#endif
+
+ /* Count how many others map onto this. */
+ score = count_maps(tuple->src.ip, tuple->dst.ip,
+ tuple->dst.protonum, conntrack);
+ if (score < best.score) {
+ /* Optimization: doesn't get any better than
+ this. */
+ if (score == 0)
+ return (struct ip_nat_range *)
+ &mr->range[i];
+
+ best.score = score;
+ best.tuple = *tuple;
+ best.range = &mr->range[i];
+ }
+ }
+ }
+ *tuple = best.tuple;
+
+ /* Discard const. */
+ return (struct ip_nat_range *)best.range;
+}
+
+/* Fast version doesn't iterate through hash chains, but only handles
+ common case of single IP address (null NAT, masquerade) */
+static struct ip_nat_range *
+find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_multi_range *mr,
+ const struct ip_conntrack *conntrack,
+ unsigned int hooknum)
+{
+ if (mr->rangesize != 1
+ || (mr->range[0].flags & IP_NAT_RANGE_FULL)
+ || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
+ && mr->range[0].min_ip != mr->range[0].max_ip))
+ return find_best_ips_proto(tuple, mr, conntrack, hooknum);
+
+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
+ tuple->src.ip = mr->range[0].min_ip;
+ else {
+ /* Only do extra mangle when required (breaks
+ socket binding) */
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ if (tuple->dst.ip != mr->range[0].min_ip
+ && hooknum == NF_IP_LOCAL_OUT
+ && !do_extra_mangle(mr->range[0].min_ip,
+ &tuple->src.ip))
+ return NULL;
+#endif
+ tuple->dst.ip = mr->range[0].min_ip;
+ }
+ }
+
+ /* Discard const. */
+ return (struct ip_nat_range *)&mr->range[0];
+}
+
+static int
+get_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack_tuple *orig_tuple,
+ const struct ip_nat_multi_range *mrr,
+ struct ip_conntrack *conntrack,
+ unsigned int hooknum)
+{
+ struct ip_nat_protocol *proto
+ = find_nat_proto(orig_tuple->dst.protonum);
+ struct ip_nat_range *rptr;
+ unsigned int i;
+ int ret;
+
+ /* We temporarily use flags for marking full parts, but we
+ always clean up afterwards */
+ struct ip_nat_multi_range *mr = (void *)mrr;
+
+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
+ and that same mapping gives a unique tuple within the given
+ range, use that.
+
+ This is only required for source (ie. NAT/masq) mappings.
+ So far, we don't do local source mappings, so multiple
+	   manips are not an issue. */
+ if (hooknum == NF_IP_POST_ROUTING) {
+ struct ip_conntrack_manip *manip;
+
+ manip = find_appropriate_src(orig_tuple, mr);
+ if (manip) {
+ /* Apply same source manipulation. */
+ *tuple = ((struct ip_conntrack_tuple)
+ { *manip, orig_tuple->dst });
+ DEBUGP("get_unique_tuple: Found current src map\n");
+ return 1;
+ }
+ }
+
+ /* 2) Select the least-used IP/proto combination in the given
+ range.
+ */
+ *tuple = *orig_tuple;
+ while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
+ != NULL) {
+ DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
+ /* 3) The per-protocol part of the manip is made to
+ map into the range to make a unique tuple. */
+
+ /* Only bother mapping if it's not already in range
+ and unique */
+ if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+ || proto->in_range(tuple, HOOK2MANIP(hooknum),
+ &rptr->min, &rptr->max))
+ && !ip_nat_used_tuple(tuple, conntrack)) {
+ ret = 1;
+ goto clear_fulls;
+ } else {
+ if (proto->unique_tuple(tuple, rptr,
+ HOOK2MANIP(hooknum),
+ conntrack)) {
+ /* Must be unique. */
+ IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
+ conntrack));
+ ret = 1;
+ goto clear_fulls;
+ } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
+ /* Try implicit source NAT; protocol
+ may be able to play with ports to
+ make it unique. */
+ struct ip_nat_range r
+ = { IP_NAT_RANGE_MAP_IPS,
+ tuple->src.ip, tuple->src.ip,
+ { 0 }, { 0 } };
+ DEBUGP("Trying implicit mapping\n");
+ if (proto->unique_tuple(tuple, &r,
+ IP_NAT_MANIP_SRC,
+ conntrack)) {
+ /* Must be unique. */
+ IP_NF_ASSERT(!ip_nat_used_tuple
+ (tuple, conntrack));
+ ret = 1;
+ goto clear_fulls;
+ }
+ }
+ DEBUGP("Protocol can't get unique tuple %u.\n",
+ hooknum);
+ }
+
+ /* Eliminate that from range, and try again. */
+ rptr->flags |= IP_NAT_RANGE_FULL;
+ *tuple = *orig_tuple;
+ }
+
+ ret = 0;
+
+ clear_fulls:
+ /* Clear full flags. */
+ IP_NF_ASSERT(mr->rangesize >= 1);
+ for (i = 0; i < mr->rangesize; i++)
+ mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
+
+ return ret;
+}
+
+static inline int
+helper_cmp(const struct ip_nat_helper *helper,
+ const struct ip_conntrack_tuple *tuple)
+{
+ return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
+}
+
+/* Where to manip the reply packets (will be reverse manip). */
+static unsigned int opposite_hook[NF_IP_NUMHOOKS]
+= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
+ [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
+#ifdef CONFIG_IP_NF_NAT_LOCAL
+ [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
+ [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
+#endif
+};
+
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack,
+ const struct ip_nat_multi_range *mr,
+ unsigned int hooknum)
+{
+ struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
+ struct ip_conntrack_tuple orig_tp;
+ struct ip_nat_info *info = &conntrack->nat.info;
+
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+ || hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_OUT);
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* What we've got will look like inverse of reply. Normally
+ this is what is in the conntrack, except for prior
+ manipulations (future optimization: if num_manips == 0,
+ orig_tp =
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+ invert_tuplepr(&orig_tp,
+ &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+
+ do {
+ if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
+ hooknum)) {
+ DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
+ conntrack);
+ return NF_DROP;
+ }
+
+
+ /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
+ the original (A/B/C/D') and the mangled one (E/F/G/H').
+
+ We're only allowed to work with the SRC per-proto
+ part, so we create inverses of both to start, then
+ derive the other fields we need. */
+
+ /* Reply connection: simply invert the new tuple
+ (G/H/E/F') */
+ invert_tuplepr(&reply, &new_tuple);
+
+ /* Alter conntrack table so it recognizes replies.
+		   If we lose this race (reply tuple now in use), repeat. */
+ } while (!ip_conntrack_alter_reply(conntrack, &reply));
+
+ /* Create inverse of original: C/D/A/B' */
+ invert_tuplepr(&inv_tuple, &orig_tp);
+
+	/* Has the source changed? */
+ if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
+ /* In this direction, a source manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_ORIGINAL, hooknum,
+ IP_NAT_MANIP_SRC, new_tuple.src });
+
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* In the reverse direction, a destination manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+ IP_NAT_MANIP_DST, orig_tp.src });
+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+ }
+
+ /* Has destination changed? */
+ if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
+ /* In this direction, a destination manip */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_ORIGINAL, hooknum,
+ IP_NAT_MANIP_DST, reply.src });
+
+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+ /* In the reverse direction, a source manip. */
+ info->manips[info->num_manips++] =
+ ((struct ip_nat_info_manip)
+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+ IP_NAT_MANIP_SRC, inv_tuple.src });
+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+ }
+
+ /* If there's a helper, assign it; based on new tuple. */
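+	/* Expected (child) connections don't get a helper of their own;
+	   only connections that weren't expected are matched against
+	   the helper list. */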
+ if (!conntrack->master)
+ info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
+ &reply);
+
+ /* It's done. */
+ info->initialized |= (1 << HOOK2MANIP(hooknum));
+ return NF_ACCEPT;
+}
+
+void replace_in_hashes(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info)
+{
+ /* Source has changed, so replace in hashes. */
+ unsigned int srchash
+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+	/* We place the packet as seen OUTGOING in the byipsproto hash
+	   (ie. reverse the dst and src of the reply packet). */
+ unsigned int ipsprotohash
+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum);
+
+ IP_NF_ASSERT(info->bysource.conntrack == conntrack);
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+
+ list_del(&info->bysource.list);
+ list_del(&info->byipsproto.list);
+
+ list_prepend(&bysource[srchash], &info->bysource);
+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+void place_in_hashes(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info)
+{
+ unsigned int srchash
+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.src,
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.protonum);
+	/* We place the packet as seen OUTGOING in the byipsproto hash
+	   (ie. reverse the dst and src of the reply packet). */
+ unsigned int ipsprotohash
+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.src.ip,
+ conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple.dst.protonum);
+
+ IP_NF_ASSERT(!info->bysource.conntrack);
+
+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+ info->byipsproto.conntrack = conntrack;
+ info->bysource.conntrack = conntrack;
+
+ list_prepend(&bysource[srchash], &info->bysource);
+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+static void
+manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
+ const struct ip_conntrack_manip *manip,
+ enum ip_nat_manip_type maniptype,
+ __u32 *nfcache)
+{
+ *nfcache |= NFC_ALTERED;
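+	/* Protocol-specific manip (ports, transport checksum) runs first,
+	   then the IP address is rewritten and the IP header checksum
+	   updated incrementally. */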
+ find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
+ iph->check);
+ iph->saddr = manip->ip;
+ } else {
+ iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
+ iph->check);
+ iph->daddr = manip->ip;
+ }
+}
+
+static inline int exp_for_packet(struct ip_conntrack_expect *exp,
+ struct sk_buff **pskb)
+{
+ struct ip_conntrack_protocol *proto;
+ int ret = 1;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ proto = __ip_ct_find_proto((*pskb)->nh.iph->protocol);
+ if (proto->exp_matches_pkt)
+ ret = proto->exp_matches_pkt(exp, pskb);
+
+ return ret;
+}
+
+/* Do packet manipulations according to binding. */
+unsigned int
+do_bindings(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+ struct ip_nat_info *info,
+ unsigned int hooknum,
+ struct sk_buff **pskb)
+{
+ unsigned int i;
+ struct ip_nat_helper *helper;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+ int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP;
+
+	/* Need the nat lock to protect against modification, but neither
+	   the conntrack (referenced) nor the helper (deleted with
+	   synchronize_bh()) can vanish. */
+ READ_LOCK(&ip_nat_lock);
+ for (i = 0; i < info->num_manips; i++) {
+ /* raw socket (tcpdump) may have clone of incoming
+ skb: don't disturb it --RR */
+ if (skb_cloned(*pskb) && !(*pskb)->sk) {
+ struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC);
+ if (!nskb) {
+ READ_UNLOCK(&ip_nat_lock);
+ return NF_DROP;
+ }
+ kfree_skb(*pskb);
+ *pskb = nskb;
+ }
+
+ if (info->manips[i].direction == dir
+ && info->manips[i].hooknum == hooknum) {
+ DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
+ *pskb,
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "SRC" : "DST",
+ NIPQUAD(info->manips[i].manip.ip),
+ htons(info->manips[i].manip.u.all));
+ manip_pkt((*pskb)->nh.iph->protocol,
+ (*pskb)->nh.iph,
+ (*pskb)->len,
+ &info->manips[i].manip,
+ info->manips[i].maniptype,
+ &(*pskb)->nfcache);
+ }
+ }
+ helper = info->helper;
+ READ_UNLOCK(&ip_nat_lock);
+
+ if (helper) {
+ struct ip_conntrack_expect *exp = NULL;
+ struct list_head *cur_item;
+ int ret = NF_ACCEPT;
+
+ DEBUGP("do_bindings: helper existing for (%p)\n", ct);
+
+ /* Always defragged for helpers */
+ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+ & htons(IP_MF|IP_OFFSET)));
+
+ /* Have to grab read lock before sibling_list traversal */
+ READ_LOCK(&ip_conntrack_lock);
+ list_for_each(cur_item, &ct->sibling_list) {
+ exp = list_entry(cur_item, struct ip_conntrack_expect,
+ expected_list);
+
+ /* if this expectation is already established, skip */
+ if (exp->sibling)
+ {
+				/* lzh 2007/3/16: fix for SIP ALG CDROUTE test */
+ exp = NULL;
+				/* lzh end */
+ continue;
+ }
+
+
+ if (exp_for_packet(exp, pskb)) {
+ DEBUGP("calling nat helper (exp=%p) for packet\n",
+ exp);
+ ret = helper->help(ct, exp, info, ctinfo,
+ hooknum, pskb);
+ if (ret != NF_ACCEPT) {
+ READ_UNLOCK(&ip_conntrack_lock);
+ return ret;
+ }
+ }
+ }
+ /* Helper might want to manip the packet even when there is no expectation */
+ if (!exp && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
+ DEBUGP("calling nat helper for packet without expectation\n");
+ ret = helper->help(ct, NULL, info, ctinfo,
+ hooknum, pskb);
+ if (ret != NF_ACCEPT) {
+ READ_UNLOCK(&ip_conntrack_lock);
+ return ret;
+ }
+ }
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ /* Adjust sequence number only once per packet
+ * (helper is called at all hooks) */
+ if (is_tcp && (hooknum == NF_IP_POST_ROUTING
+ || hooknum == NF_IP_LOCAL_IN)) {
+ DEBUGP("ip_nat_core: adjusting sequence number\n");
+ /* future: put this in a l4-proto specific function,
+ * and call this function here. */
+ ip_nat_seq_adjust(*pskb, ct, ctinfo);
+ }
+
+ return ret;
+
+ } else
+ return NF_ACCEPT;
+
+ /* not reached */
+}
+
+unsigned int
+icmp_reply_translation(struct sk_buff *skb,
+ struct ip_conntrack *conntrack,
+ unsigned int hooknum,
+ int dir)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+ struct iphdr *inner = (struct iphdr *)(hdr + 1);
+ size_t datalen = skb->len - ((void *)inner - (void *)iph);
+ unsigned int i;
+ struct ip_nat_info *info = &conntrack->nat.info;
+
+ IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
+ /* Must be RELATED */
+ IP_NF_ASSERT(skb->nfct - (struct ip_conntrack *)skb->nfct->master
+ == IP_CT_RELATED
+ || skb->nfct - (struct ip_conntrack *)skb->nfct->master
+ == IP_CT_RELATED+IP_CT_IS_REPLY);
+
+ /* Redirects on non-null nats must be dropped, else they'll
+ start talking to each other without our translation, and be
+ confused... --RR */
+ if (hdr->type == ICMP_REDIRECT) {
+ /* Don't care about races here. */
+ if (info->initialized
+ != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
+ || info->num_manips != 0)
+ return NF_DROP;
+ }
+
+ DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
+ skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+ /* Note: May not be from a NAT'd host, but probably safest to
+ do translation always as if it came from the host itself
+ (even though a "host unreachable" coming from the host
+ itself is a bit weird).
+
+ More explanation: some people use NAT for anonymizing.
+ Also, CERT recommends dropping all packets from private IP
+ addresses (although ICMP errors from internal links with
+ such addresses are not too uncommon, as Alan Cox points
+ out) */
+
+ READ_LOCK(&ip_nat_lock);
+ for (i = 0; i < info->num_manips; i++) {
+ DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
+ i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
+ "ORIG" : "REPLY", info->manips[i].hooknum);
+
+ if (info->manips[i].direction != dir)
+ continue;
+
+ /* Mapping the inner packet is just like a normal
+ packet, except it was never src/dst reversed, so
+ where we would normally apply a dst manip, we apply
+ a src, and vice versa. */
+ if (info->manips[i].hooknum == opposite_hook[hooknum]) {
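+			/* !maniptype swaps the manip type here: a stored SRC
+			   manip is applied to the inner packet as a DST manip
+			   and vice versa. */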
+ DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "DST" : "SRC",
+ NIPQUAD(info->manips[i].manip.ip),
+ ntohs(info->manips[i].manip.u.udp.port));
+ manip_pkt(inner->protocol, inner,
+ skb->len - ((void *)inner - (void *)iph),
+ &info->manips[i].manip,
+ !info->manips[i].maniptype,
+ &skb->nfcache);
+ /* Outer packet needs to have IP header NATed like
+ it's a reply. */
+ } else if (info->manips[i].hooknum == hooknum) {
+			/* Use the mapping to map the outer packet: proto 0 gives no
+ per-proto mapping */
+ DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
+ ? "SRC" : "DST",
+ NIPQUAD(info->manips[i].manip.ip));
+ manip_pkt(0, iph, skb->len,
+ &info->manips[i].manip,
+ info->manips[i].maniptype,
+ &skb->nfcache);
+ }
+ }
+ READ_UNLOCK(&ip_nat_lock);
+
+	/* Since we mangled inside the ICMP packet, recalculate its
+ checksum from scratch. (Hence the handling of incorrect
+ checksums in conntrack, so we don't accidentally fix one.) */
+ hdr->checksum = 0;
+ hdr->checksum = ip_compute_csum((unsigned char *)hdr,
+ sizeof(*hdr) + datalen);
+
+ return NF_ACCEPT;
+}
+
+int __init ip_nat_init(void)
+{
+ size_t i;
+
+ /* Leave them the same for the moment. */
+ ip_nat_htable_size = ip_conntrack_htable_size;
+
+ /* One vmalloc for both hash tables */
+ bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
+ if (!bysource) {
+ return -ENOMEM;
+ }
+ byipsproto = bysource + ip_nat_htable_size;
+
+ /* Sew in builtin protocols. */
+ WRITE_LOCK(&ip_nat_lock);
+ list_append(&protos, &ip_nat_protocol_tcp);
+ list_append(&protos, &ip_nat_protocol_udp);
+ list_append(&protos, &ip_nat_protocol_icmp);
+ WRITE_UNLOCK(&ip_nat_lock);
+
+ for (i = 0; i < ip_nat_htable_size; i++) {
+ INIT_LIST_HEAD(&bysource[i]);
+ INIT_LIST_HEAD(&byipsproto[i]);
+ }
+
+ IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+ ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+ return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again. */
+static int clean_nat(const struct ip_conntrack *i, void *data)
+{
+ memset((void *)&i->nat, 0, sizeof(i->nat));
+ return 0;
+}
+
+/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
+void ip_nat_cleanup(void)
+{
+ ip_ct_selective_cleanup(&clean_nat, NULL);
+ ip_conntrack_destroyed = NULL;
+ vfree(bysource);
+}