\documentclass[a4paper,portrait,english,mathserif]{beamer}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{textcomp}
\usepackage{url}
\usepackage{lmodern}
\setcounter{tocdepth}{2}

\usetheme{CambridgeUS}

\setbeamercolor{palette primary}{fg=blue!50!black,bg=gray!20!white}
\setbeamercolor{palette secondary}{fg=blue!50!black,bg=gray!40!white}
\setbeamercolor{palette tertiary}{bg=blue!50!black,fg=white}
\setbeamercolor{titlelike}{bg=gray!20!white,fg=blue!50!black}
%\setbeamerfont{block title}{family=\sffamily}
\setbeamercolor{block title}{use=structure,fg=white,bg=structure.fg!75!black}
\setbeamercolor{block title alerted}{use=alerted text,fg=white,bg=alerted text.fg!75!black}
\setbeamercolor{block title example}{use=example text,fg=white,bg=example text.fg!75!black}
\setbeamercolor{block body}{parent=normal text,use=block title,bg=block title.bg!20!bg}
\setbeamercolor{block body alerted}{parent=normal text,use=block title alerted,bg=block title alerted.bg!20!bg}
\setbeamercolor{block body example}{parent=normal text,use=block title example,bg=block title example.bg!20!bg}

%\usefonttheme{serif}

\begin{document}

\title{Xtables2: Love for blobs}
\author{Jan\ Engelhardt}
\date{2010-Oct-18}
\institute[\textsf{NFWS2010}]{Presented at NFWS 2010}

\frame{
	\titlepage
}

\frame{
	\frametitle{Table of Contents}
	\tableofcontents[hideallsubsections]
}

\AtBeginSection{
	\frame{
		\frametitle{Section TOC}
		\tableofcontents[currentsection,hideothersubsections]
	}
}

\section{Introduction}
\subsection{Xtables1}

\frame{
	\frametitle{Current status}

	\begin{itemize}

	\item \textsf{ip\_tables} started with a
	packed serialized ruleset (``blob''~-- binary large object)

	\pause

	\item \textsf{ip6\_tables} is a copy-and-paste product of
	\textsf{ip\_tables}. And so is \textsf{arp\_tables}. And so is
	\textsf{ebtables}. Yuck!

% And we know who caused the mindless duplication. Dave, one minus point.

	\pause

	\item Changes to \textsf{ip\_tables} could still be mirrored to
	\textsf{ip6\_tables} and \textsf{arp\_tables}

	\item \textsf{ebtables} took its own incompatible path of
	development

	\pause

	\item Combined with compat support, there are now
	\textit{\only<4>{seven}\only<5->{eight}} formats to support in the
	kernel

% It was seven. fw, why did you have to add compat support to ebtables :-(

	\item \only<4>{A big itch to scratch.}\only<5->{Eight itches to
	scrub.}

	\end{itemize}
}

\subsection{Xtables2 ideas (some)}

\frame{
	\frametitle{A protocol-independent format}

	\begin{itemize}

	\item Rule tree without protocol-specific parts in it, to be used by
	and for all protocol handlers

	\item Translatation from and to input formats on-the-fly, i.\,e.\
	during \texttt{SO\_SET\_REPLACE}/etc.

	\pause

	\item Formats are just minimally different: serialized stream of
	\texttt{struct ipt\_entry} vs.\ \texttt{struct ip6t\_entry}

	\end{itemize}

	\pause

	$\Rightarrow$ Led to \textsf{Xtables2}
}

\subsection{Xtables2 development history}

\frame{
	\frametitle{Developments}

	SSA/LL\footnote{Small scale allocations, or small scattered
	allocations, combined with linked lists}$^{\text{,}}$\footnote{Has
	nothing to do with GCC's SSA} style:

	\begin{itemize}

	\item ``proto1'': initial submission on 2009-Aug-04 for v2.6.31-rc\\
	(103 patches)

% google keywords: xtables2 snapshot 20090804

	\item busy dealing with cleanups: 46/103

	\pause

	\item ``proto2'': partial set posted on 2010-Jun-04 for v2.6.35-rc\\
	(33 patches, and a nasty surprise)

% google keywords: xt2 table core

	\pause

	\item ``proto3'': simple rebase for v2.6.36-rc for better comparison
	with the upcoming proto4

	\end{itemize}

	PCR style:

	\begin{itemize}

	\item ``proto4'': xt2 using packed-chain rulesets, for v2.6.36-rc

	\end{itemize}
}

\section{Xtables2 SSA prototype}
\subsection{Data layout}

\frame{
	\frametitle{Chosen data layout}

	\begin{itemize}

	\item Linked lists allow for ``easy manipulation'' of the ruleset

	\item Small-scale allocations (SSA) are more easily satisfiable.

%	\item Eric Dumazet: ``\texttt{vmap allocation for size 19869696
%	failed}'' (odd)

	\pause

	\item Prototype: Translators work nicely, and with a bit of macro
	magic, eliminated 40\% of LOC from the \{ip,ip6,arp\} combo.

	\end{itemize}
}

\subsection{Tests}
\subsection*{Test \#1}

\frame{
	\frametitle{Ruleset}

	\begin{itemize}

% For testing, I used a simple ruleset that "would take long enough" so that
% measuring wall clock time ain't just jitter.

	\item Just a simple ruleset that would be large enough so that
	wall time is visible

	\pause

	\begin{block}{Just \texttt{struct ip6t\_entry}, but lots of them}
	\texttt{-A \$chain -s ::1 -d ::1}
	\end{block}

	\item no extensions, just \texttt{struct ip6t\_entry}~$\times$ 1000
	rules~$\times$ 100 chains reachable from \textsf{INPUT}
	(\textsf{OUTPUT} is left empty)

	\pause

	\item 100,202 rules (100,000 base rules + 100 calls + 100 implicit
	invisible \textsf{RETURN}s converted from Xt1 + 2 implicit Xt1
	\textsf{RETURN}s from base chains)

	\item $\approx$20 MB in packed form

% This ruleset is approximately 20 MB in size. It is this ruleset that Eric
% Dumazet ran into a memory allocation shortage. Which seems a bit off because
% 20 MB is not a lot of memory per se - though it gets worse the more logical
% CPUs there are in a system due to duplication.

	\end{itemize}
}

\frame{
	\frametitle{Comparison with real rulesets}

% Now some people claimed that my simplistic ruleset does not match reality. I
% never debated that, but let's look at some real-world ruleset nevertheless,
% because there are in fact some interesting points to note.

	\pause

	\begin{itemize}

% I'll let Jesper tell the tales of what it is like with a _real_ ruleset.
%
% When asked for the rulesets used for the previous workshop's presentation,
% Jesper gave me this ruleset. Out of fun and interest, I ran standard
% statistics on it. (This depicts the output of GNU R.)

	\item Jesper has down-to-earth rulesets:

	\begin{exampleblock}{67,892 visible rules in 18,329 chains: rule
	density distribution}

	\texttt{\small
	> summary(data)\\
	~~~Min.~1st~Qu.~~Median~~~~Mean~3rd~Qu.~~~~Max.\\
	~~1.000~~~1.000~~~2.000~~~3.745~~~4.000~119.000\\
	}
	\end{exampleblock}

% It is almost as big as the one I used. Whereas I used a plain IPv6 ruleset,
% Jesper's is IPv4 with matches and targets.

	\item Packed size is 16,866,200 bytes

% He also put thought into his ruleset to minimize the number of rules that
% need to be processed - quite the contrary to what I needed for timing tests.

	\item Design: fanned tree, only $\approx$53 rules executed per packet

% Hard to statically analyze
%	\item Longest theoretical rule span executed: 362,221 rules

	\pause

% The low rule density arouses interest on whether it might not have adverse
% affects on scalability. So let's keep it in mind.

	\item Low rule density sounds like management overhead~--
	need to keep that in mind for later

	\end{itemize}
}

\frame{
	\frametitle{Comparison with real rulesets}

% Later, he sent me another ruleset that was an agglomeration of all of his.

	\begin{itemize}

	\item Jesper has down-to-earth rulesets:

	\begin{alertblock}{662,160 visible rules in 151,426 chains: rule
	density distribution}

	\texttt{\small
	> summary(data)\\
	~~~Min.~1st~Qu.~~Median~~~~Mean~3rd~Qu.~~~~Max.\\
	~~1.000~~~1.000~~~4.000~~~4.477~~~4.000~144.000\\
	}
	\end{alertblock}

	\item Packed size is 156,258,112 bytes

	\item Design: fanned tree, only $\approx$77 rules executed per packet

	\item Low rule density sounds like management overhead~--
	need to keep that in mind for later

	\end{itemize}
}

\frame{
	\frametitle{Test procedure}

	\begin{block}{100,000× \texttt{struct ip6t\_entry}}
	\texttt{-A mychain\$i -s ::1 -d ::1}
	\end{block}

	\begin{itemize}

	\item Earlier tests with \texttt{ping6 -f} were flawed.

	\begin{exampleblock}{Testing proto2}
	\texttt{ping6 -fqc 500 -i .001 localhost}
	\end{exampleblock}

	\pause

	\item Without rules, this gives 500 ms total execution time: packet
	handling is quick, ping is just waiting for the intervals to expire.

	\item \texttt{-i .001} made sure that (with rules) no packets were
	reported dropped

	\item With rules, this goes up: once it starts going above 500 ms,
	we know packet processing takes longer than the 1 ms interval.

	\end{itemize}
}

\frame{
	\frametitle{Results}

	\begin{itemize}

% mean(c(14755,14383,14381,15432,14498)) / mean(c(3405,3525,3426,3388,3328))
% mean(c(25340,25212,25047,24699,25042)) / mean(c(3635,3307,3278))

	\item So-gathered statistics showed an execution time expansion of
	4.30× (xt1: 3500 ms $\rightarrow$ proto2: 15000 msec)

	\item ``Linked lists no good?''

% though it did seem like something was wrong. Increased timings are within
% the realm of expectation, but I did not expect it to be quite this bad.

	\pause

	\item Using ping this way was flawed... ping handles packets
	asynchronously when using \texttt{-f}

	\item Let's reset.

	\end{itemize}
}

\subsection*{Test \#2}

\frame{
	\frametitle{Test procedure}

	\pause

	\begin{exampleblock}{Testing proto3 with revised command}
	\texttt{ping6 -Ac 500 ::1}
	\end{exampleblock}

	\begin{itemize}

	\item Observing ping's RTT statistics rather than execution time

	\pause

	\item Additionally, I sampled the CPU cycle counter around
	\texttt{xt2\_do\_table} and the ematch loop in
	\texttt{xt2\_do\_actions}

	\end{itemize}

	$\Rightarrow$ much more consistent results
}

\frame{
	\frametitle{Results}

	\begin{itemize}

	\item Expansion factor: 2.80× (xt1: 40.477 ms $\rightarrow$ proto3:
	113.424 ms)

	\item Increase expected (being a pessimist), but this much still blew
	everything

	\pause

% And, like before, when the expansion factor had been determined to be
% 4-fold, I wondered why there is such an increase of execution time.

% At LCJ 2010, Frank Rowand held a talk about how I-cache dirtying impacts
% realtime guarantees. This sparked the idea that, maybe, what we are seeing in
% the SSA prototypes with linked lists are due to D-cache misses as a result of
% objects being randomly distributed in memory.

	\item Speculation: lots of D-cache
	misses\footnote{\url{http://events.linuxfoundation.org/2010/linuxcon-japan/rowand}~--
	Identifying Embedded Real-Time Latency Issues: I-Cache and Locks} due
	to the objects being ``spread out'' in memory

	\pause

	\item Use of \texttt{kmem\_cache} pools for objects of constant size
	(table, chain and rule list heads) showed no improvement

	\pause

	\item And then there was memory...

	\end{itemize}
}

\subsection{Memory considerations}

\frame{
	\frametitle{Memory usage}

	Previously, with a blob:

	\begin{itemize}

	\item 1 vmalloc'd object of $\approx$20 MB

	\end{itemize}

	\pause

% But, while smaller allocations are more easily satisfiable, they have
% serious issues.

	Now, split allocations...?

	\begin{itemize}	

	\item SL{*}B has to housekeep an 1,002,111 extra kmalloc'd objects now

	\only<2>{\begin{itemize}

	\item 1× \texttt{struct xt2\_table}

	\item 100× \texttt{struct xt2\_chain}s

	\item 100,201× \texttt{struct xt2\_rule}s

	\item 100,201× \texttt{struct xt2\_entry\_match} for ``ipv6''

	\item 100,201× \texttt{struct ip6t\_ip6} for ``ipv6''

	\item 200,402× \texttt{struct xt2\_entry\_match} for ``quota''

	\item 200,402× \texttt{struct xt\_quota} for ``quota''

	\item 200,402× \texttt{struct xt\_quota\_priv} for ``quota''

	\item 100,201× \texttt{struct xt2\_entry\_target} for implicit CONTINUE

	\item This is of course the other end of the two extremes.

	\end{itemize}}

	\pause

	\item Memory usage increase of 2.7× (i586).
	\texttt{/proc/slabinfo}:

	\begin{itemize}

	\item $\approx$900,000× \texttt{size-32}

	\item $\approx$100,000× \texttt{size-192}

	\item 48 MB, plus some housekeeping, for a total of $\approx$53 MB

	\begin{alertblock}{Layman's observation}
	\texttt{\small \textbf{\# }free; ip6tables-restore bigrules; free\\
	~~~~~~~~~~~~~~~~~~~~~~~~~used~~~~~~~free\\
	-/+~buffers/cache:~~~~~~34056~~~~1002172\\
	-/+~buffers/cache:~~~~~~86392~~~~~949836\\
	}
	\end{alertblock}

	\end{itemize}

	\end{itemize}

	\pause

	$\Rightarrow$ Small scattered allocations are a no-go.
}

\section{Ideas for fixing}
\subsection{Packed rulesets}

\frame{
	\frametitle{Love for blobs}

	\begin{itemize}

% For performant evaluation of rules, we want no scattered allocations.

	\item Evaluation of rules: we want no scattered allocs

% And to keep memory usage down, as few allocations as possible.

	\item Housekeeping: we want few allocs

	\pause

% But that is exactly what the original iptables design does.

	\item Original iptables design decision pays off (Harald was right all
	along!)

	\begin{itemize}

	\item packed ruleset allows for streaming reads

% Once again we can gloat over BSD class operating systems, because they use...
% linked lists. Blubb!

	\item \textsf{ipfw} and \textsf{pf} use linked
	lists~~<\textdegree{}\})))><

% For the record, the OpenBSD 4.7 documentation points out that PF only uses a
% single core. """PF will only use one processor, so multiple processors (or
% multiple cores) WILL NOT improve PF performance."""

	\end{itemize}

	\pause

	\item Let's try concentrating on packed rulesets again (kernel side
	only)

	\end{itemize}

	\pause

	Need to find ways to make working with them easier

	\pause

	\begin{itemize}

	\item A good API is half the job

	\item Algorithms to keep the time cost of updating rulesets in-place
	low

	\end{itemize}
}

\subsection{API guide}

\frame{
	\frametitle{About APIs}

	\begin{itemize}

% Computer Science always votes for opacity. Opaque types (typedefs),
% opaque this, opaque that, but sometimes that just goes awry,

	\item Opaque macros/functions gone too opaque

	\end{itemize}

	\begin{alertblock}{\texttt{IP6T\_MATCH\_ITERATE}}

	\texttt{\small
	\textbf{struct} compat\_ip6t\_entry \textbf{*}e = ...;\\
	ret = COMPAT\_IP6T\_MATCH\_ITERATE(e, compat\_find\_calc\_match, name,\\
	~~~~~~\&e->ipv6, e->comefrom, \&off, \&j);
	}

	\end{alertblock}

% and open-coding is friendlier to the developer eye in the long run. The
% kernel's foreach idiom many developers know already, so why not devise
% an iteration that works similarly

	\begin{exampleblock}{\texttt{xt\_ematch\_foreach}}

	\texttt{\small
	\textbf{struct} compat\_ip6t\_entry \textbf{*}e = ...;\\
	\textbf{struct} xt\_entry\_match \textbf{*}ematch;\\
	xt\_ematch\_foreach(ematch, e) \{\\
	~~~~~~~~ret = compat\_find\_calc\_match(ematch, name, \&e->ipv6,\\
	~~~~~~~~~~~~~~e->comefrom, \&off);\\
	~~~~~~~~\textbf{if} (ret != 0)\\
	~~~~~~~~~~~~~~~~break;\\
	~~~~~~~~++j;\\
	\}}

	\end{exampleblock}
}

\frame{
	\begin{itemize}

	\item Implementation is also much friendlier to long-term maintainers

	\item \texttt{xt\_ematch\_foreach} is KISS and may save function call
	overhead

	\end{itemize}

% I was trying to be generous to IP6T_MATCH_ITERATE

	\begin{alertblock}{\texttt{IP6T\_MATCH\_ITERATE}}

	\texttt{\tiny
	\#define XT\_MATCH\_ITERATE(type, e, fn, args...)~\textbackslash{}\\
	(\{~\textbackslash{}\\
	~~~~~~~~\textbf{unsigned int} i;~\textbackslash{}\\
	~~~~~~~~\textbf{int} ret;~\textbackslash{}\\
	~~~~~~~~\textbf{struct} xt\_entry\_match \textbf{*}m;~\textbackslash{}\\
	~~~~~~~~\textbf{for} (i = sizeof(type); i < e->target\_offset; i += m->u.match\_size) \{~\textbackslash{}\\
	~~~~~~~~~~~~~~~~m = e + i;~\textbackslash{}\\
	~~~~~~~~~~~~~~~~ret = fn(m, \#\# args);~\textbackslash{}\\
	~~~~~~~~~~~~~~~~\textbf{if} (ret != 0)~\textbackslash{}\\
	~~~~~~~~~~~~~~~~~~~~~~~~break;~\textbackslash{}\\
	~~~~~~~~\}~\textbackslash{}\\
	~~~~~~~~ret;~\textbackslash{}\\
	\})\\
	}

	\end{alertblock}

	\begin{exampleblock}{\texttt{xt\_ematch\_foreach}}

	\texttt{\tiny
	\#define xt\_ematch\_foreach(pos, entry)~\textbackslash{}\\
	~~~~~~~~\textbf{for} (pos = entry->elems;~\textbackslash{}\\
	~~~~~~~~~~~~~pos < entry + entry->target\_offset;~\textbackslash{}\\
	~~~~~~~~~~~~~pos = pos + pos->u.match\_size)\\
	}

	\end{exampleblock}
}

\subsection{Update cost considerations}

\frame{
	\frametitle{Blobs for ¥100: Single rules}

	\pause

	\begin{itemize}

	\item Xt1 blob rules refer to chains (when jumping) by their absolute
	offset in the blob (i.\,e.\ bytes from the start of the blob)

	\pause

	\item Insertion or deletion of a chain/rule in a blob shifts the offset
	of all subsequent chains

	\item Requires updating the chain offsets of all jumping rules

	\item With $k$ rules already loaded, that is
	$\mathcal{O}\left(k\right)$

	\pause

	\item Adding $n$ rules leads to $\mathcal{O}\left(n^2\right)$
	behavior~-- ouch

	\item Userspace \textsf{iptables}(8) still submits entire tables, but
	translation process does currently add one rule at a time to xt2
	however

	\item Important to keep in mind for future fine-grained modifications
	initiated from userspace

	\end{itemize}
}

\frame{
	\frametitle{Blobs for ¥200: Bulk operations}

	\pause

	\begin{itemize}

	\item Insertion of rules can be batched; reservation of enough bytes at
	once:

	\begin{exampleblock}{Multi-rule reservation also in
	$\mathcal{O}\left(k\right)$}

	\texttt{\small
	new = malloc(cur\_size + x);\\
	memcpy(new, cur\_ruleset, ins\_offset);\\
	memcpy(new + ins\_offset + x, cur\_ruleset + ins\_offset, cur\_size -
	ins\_offset);}

	\end{exampleblock}

	\item Process is similar for bulk deletion

	\pause

	\item Largest contiguous block is the set of rules of a chain

% Why cannot we add two chains at once? Well, in practice, one creates all
% chain heads first so that there is no ordering dependency introduced
% on rule insertion.

	\item Therefore, with $c$ chains, a bulk update would only be
	$\mathcal{O}\left(c\cdot n\right)$

	\pause

	\item Still suboptimal: Consider low rule density from earlier:
	$\frac{n}{c} \rightarrow 1 \Longrightarrow \lim_{c\rightarrow
	n}\mathcal{O}\left(c\cdot n\right)=\mathcal{O}\left(n^2\right)$

	\end{itemize}
}

\frame{
	\frametitle{Blobs for ¥500: Indirect addressing}

	\begin{itemize}

	\item Can we get rid of the costly updates?

	\end{itemize}

	\pause

	Yes, in two stages. Number one:

	\begin{block}{Indirect chain lookup}

	\texttt{\small
	~next\_rule = tbl->blob +\\
	~~~~~~~~~~~~~~tbl->chain\_offset{[}rule->chain\_index{]}
	}

	\end{block}

	\begin{itemize}

	\item (cf.\ Xt1: \texttt{next\_rule = tbl->blob + rule->jump\_offset})

	\pause

	\item On rule insertion/deletion, only \texttt{chain\_offset} needs to
	be adjusted, for $\mathcal{O}\left(c\right)$.

	\pause

	\item Still has other costs: chain head deletion is
	$\mathcal{O}\left(k\right)$ (can be mitigated by lazy deletion).

	\end{itemize}
}

\section{Xtables2 PCR prototype}
\subsection{Data layout}

\frame{
	\frametitle{Blobs for ¥1,000: Decoupled chains}

	\begin{itemize}

	\item Prediction/Assumption: Since jumps can go across the entire blob,
	D-cache won't help anyway

	\item Loosen up on strict packing, just a little

	\pause

	\item Let largest contiguous entity be the chain rather than table

	\pause

	\item Combined with indirect chain lookup, no chain offset updates
	needed \textit{at all}.

	\end{itemize}
}

\frame{
	\begin{block}{xt2 sample chain head}

	\texttt{\small
	\textbf{struct} xt2\_chain \{\\
	~~~~~~~~\textbf{char} name\textbf{{[}XT\_EXTENSION\_MAXNAMELEN{]}};\\
	~~~~~~~~\textbf{void {*}}rule\_blob;\\
	\};\\
	}

	\end{block}

	\begin{exampleblock}{Jump action}

	\texttt{\small
	\textbf{struct} xt2\_packed\_etarget \textbf{*}target;\\
	next\_rule = target->r\_jump->rule\_blob;\\
	}

	\end{exampleblock}

	\begin{itemize}

	\item \texttt{\&some\_xt2\_chain} always remains the same over its
	lifetime~-- no more updates of rules required

	\end{itemize}
}

\subsection{Tests}
\subsection*{Test \#3}

\frame{
	\frametitle{Results}

	\begin{itemize}

	\item 100k rules like before, measuring RTT again

	\end{itemize}

	\begin{exampleblock}{Testing RTT for proto4}
	\texttt{ping6 -Ac 500 ::1}
	\end{exampleblock}

	\pause

	\begin{itemize}

	\item Observed expansion: 1.83× (xt1: 40.477 ms $\rightarrow$
	proto4: 74.135 ms)

	\item Splendid! Packed-chain rulesets work.

	\pause

	\item But what's with the remaining 83\%?

	\end{itemize}
}

\subsection{Latency observations}

\frame{
	\frametitle{Rule counters in Xtables2}

	\begin{itemize}

	\item xt2 rules carry absolutely nothing per default

	\item Per-rule counters are temporarily implemented by using two
	\textsf{xt\_quota} ematches in upcounting mode

	\pause

	\begin{itemize}

	\item The ``ipv6'' match with \texttt{-s ::1 -d ::1} runs in 200--300
	cycles

	\item One ``quota'' ematch takes prohibitely costly 4500 cycles

	\pause

	\item (In)significance of raw cycle counts

	\item Does not tell whether PCR	might still incur a bottleneck

	\item Main function of \textsf{xt\_quota} is only 19 LOC, but
	\textsf{xt\_ipv6}'s is 79 LOC.

	\end{itemize}

	\end{itemize}
}

\frame{
	\frametitle{Equal-power comparison}

	\begin{itemize}

	\begin{alertblock}{Just as costly}
	\texttt{-A INPUT -s ::1 -d ::1 -m quota -{}-grow -m quota -{}-grow}
	\end{alertblock}

	\item Driving xt1 with \textsf{xt\_quota} counters yields an RTT of
	77.373 ms.

	\pause

	\item Xtables2 PCR (74.135 ms) is absolutely on par

	\item \textsf{xt\_quota} is the one and only bottleneck

	\end{itemize}
}

\frame{
	\frametitle{\textsf{xt\_quota} analysis}

	\begin{itemize}

	\item Using the simplest possible counter implementation instead of
	full-featured \textsf{xt\_quota}, proto4 execution time drops to 44.254
	ms.

	\pause

	\item Adding a kmalloc for a private data structure to this simple
	impl. and time jumps to 50.733 ms (=\ +15\%).

	\item D-cache misses~-- again!?

	\end{itemize}
}

\section{Conclusion}

\frame{
	\frametitle{Future}

	Roadmap:

	\begin{itemize}

	\item Continue using packed rulesets for packet processing

	\end{itemize}

	Deemed solvable:

	\begin{itemize}

	\item Optimize extensions to contain fewer far-away accesses

	\end{itemize}

	Deemed infeasbly solvable:

	\begin{itemize}

	\item ebtables

	\end{itemize}
}

\frame{
	\frametitle{Questions}

	\only<1>{

	\begin{itemize}

	\item I know you have some!

	\end{itemize}

	}

	\onslide<2->

	\begin{itemize}

	\item K7: AMD K7 Athlon 1.66GHz (manuf. 2003) 256K cache 2.6.36

	\item i7: Intel Core i7 920 4-core 2.67GHz (2009) 8MB 2.6.33

	\item VM: VirtualBox machine 1-core on i7 2.6.36

	\end{itemize}

	\begin{tabular}{lccc}
	Driver & RTT K7 & RTT i7 & RTT VM \tabularnewline
	\hline 
	xt1 +2s & 40.447 & 2.83 & 3.08 \tabularnewline
	xt1 +1Q & 58.882 & 5.18 & 11.47 \tabularnewline
	xt1 +2Q & 77.373 & 11.50 & 21.00 \tabularnewline
	xt2-proto3 +2Q & 113.424 & n/a & 24.47 \tabularnewline
	xt2-proto4 +2Q & 74.135 & n/a & 21.79 \tabularnewline
	xt2-proto4 +2s & 44.254 & n/a & n/a \tabularnewline
	\onslide<3->{nft +2s & 57.8\tabularnewline}
	\end{tabular}

	\begin{itemize}

	\item s: simple local counters

	\item Q: \textsf{xt\_quota}-based counters

	\end{itemize}
}

\end{document}

