\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fullpage}
\usepackage{hyperref}
\usepackage[capitalise,nameinlink]{cleveref}
\hypersetup{hidelinks,hypertexnames=false,colorlinks=true,allcolors=blue}
\usepackage{cite}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{fact}{Fact}{Facts}
\crefname{theorem}{Theorem}{Theorems}
\crefname{corollary}{Corollary}{Corollaries}
\crefname{claim}{Claim}{Claims}
\crefname{example}{Example}{Examples}
\crefname{problem}{Problem}{Problems}
\crefname{setting}{Setting}{Settings}
\crefname{definition}{Definition}{Definitions}
\crefname{assumption}{Assumption}{Assumptions}
\crefname{subsection}{Subsection}{Subsections}
\crefname{section}{Section}{Sections}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
% \handout{#1 lecture number}{#2 date}{#3 lecturer}{#4 scribe line}{#5 title}:
% typesets the framed lecture-notes header box (course name + date, centered
% title, then lecturer/scribe line). Note #1 is unused here directly; it is
% threaded through by \lecture below.
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
% row 1: course name flush left, date (#2) flush right
\hbox to 5.78in { {\bf CS 270: Combinatorial Algorithms and Data Structures
} \hfill #2 }
\vspace{4mm}
% row 2: large centered title (#5)
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
% row 3: lecturer (#3) flush left, scribe line (#4) flush right, in italics
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
% \lecture{number --- date}{semester}{lecturer}{scribes}:
% convenience wrapper that prefixes the scribe names and builds the
% "Lecture <n>" title before delegating to \handout.
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}[section]
\newtheorem*{theorem*}{Theorem}
\newtheorem{itheorem}{Theorem}
\newtheorem{subclaim}{Claim}[theorem]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem*{proposition*}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{lemma*}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem*{conjecture*}{Conjecture}
\newtheorem{fact}[theorem]{Fact}
\newtheorem*{fact*}{Fact}
\newtheorem{exercise}[theorem]{Exercise}
\newtheorem*{exercise*}{Exercise}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem*{hypothesis*}{Hypothesis}
\newtheorem{conjecture}[theorem]{Conjecture}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{setting}[theorem]{Setting}
\newtheorem{construction}[theorem]{Construction}
\newtheorem{example}[theorem]{Example}
\newtheorem{question}[theorem]{Question}
\newtheorem{openquestion}[theorem]{Open Question}
% \newtheorem{algorithm}[theorem]{Algorithm}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{protocol}[theorem]{Protocol}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{exercise-easy}[theorem]{Exercise}
\newtheorem{exercise-med}[theorem]{Exercise}
\newtheorem{exercise-hard}[theorem]{Exercise$^\star$}
\newtheorem{claim}[theorem]{Claim}
\newtheorem*{claim*}{Claim}
\newtheorem{remark}[theorem]{Remark}
\newtheorem*{remark*}{Remark}
\newtheorem{observation}[theorem]{Observation}
\newtheorem*{observation*}{Observation}
% additional packages/commands
\usepackage{mathtools}
\usepackage{xparse}
\usepackage{suffix} % commands with suffix
\usepackage{physics}
\usepackage{parskip, float}
\usepackage{microtype} % Micro-kerning
\newcommand\definedas\coloneqq
\newcommand\from{:}
\renewcommand{\Pr}{\operatorname{\mathbb{P}}\mathopen{}\opbraces{}} % Probability
\WithSuffix\newcommand\Pr*{\operatorname{\mathbb{P}}\mathopen{}} % Probability (no qty)
\newcommand{\Exp}{\operatorname{\mathbb{E}}\mathopen{}\opbraces{}} % Expected value
\WithSuffix\newcommand\Exp*{\operatorname{\mathbb{E}}\mathopen{}} % Expected value (no qty)
\usepackage{tikz}
\usepackage{circuitikz}
\usetikzlibrary{calc,trees,positioning,arrows,arrows.meta,fit,shapes,calc,intersections,fadings,decorations.markings,decorations.pathmorphing,calligraphy,matrix,math} % arrows.meta needed for the Bar-Bar arrow tips used below
% general settings
\tikzset{
>=latex
}
% box and pointer diagrams
\tikzset{
/tikz/boxandpointer/.style={
node distance=0pt and 0pt,
/tikz/element/.append style={
outer sep=0pt,
minimum size=20pt
}
}
}
% tikz array diagram
% syntax: \tikzarray[start coord]{label prefix}{elements}
% \tikzarray[start coord]{label prefix}{comma-separated cell contents}:
% draws a horizontal row of boxed array cells inside a tikzpicture.
% Cell i is a node named <prefix>-i (1-indexed); an invisible zero-size
% anchor node <prefix>-0 is placed at the optional start coordinate
% (default 0,0) so the first cell has something to attach to.
% Cells use the /tikz/element style (see the boxandpointer style above).
\NewDocumentCommand{\tikzarray}{ O{0,0} m m }{%
% zero-size anchor node at the starting coordinate
\node[outer sep=0pt, inner sep=0pt] (#2-0) at (#1) {};
\foreach[count=\i] \x in {#3} {
% place cell \i flush (0pt) to the right of the previous cell
\pgfmathtruncatemacro\prev{\i - 1}
\node[element, draw, right=0pt of #2-\prev] (#2-\i) {\x};
}
}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
% \topmargin 0pt
% \advance \topmargin by -\headheight
% \advance \topmargin by -\headsep
% \textheight 8.9in
% \oddsidemargin 0pt
% \evensidemargin \oddsidemargin
% \marginparwidth 0.5in
% \textwidth 6.5in
% \parindent 0in
% \parskip 1.5ex
\begin{document}
\lecture{13 --- February 28, 2023}{Spring 2023}{Prof.\ Jelani Nelson}{Alec Li, Anna Deza}
\section{Overview}
In this lecture, we will look at the approximate membership query problem, and how Bloom filters~\cite{Bloom70} provide an efficient solution using only $o(nw)$ bits of memory. We'll also look at Cuckoo hashing~\cite{PR01}, a solution to the dynamic dictionary problem that gives $O(1)$ worst-case query time, an improvement over the expected $O(1)$ query time from hashing with chaining or with linear probing.
\section{Bloom Filters}
Recall the approximate membership problem; we want to maintain a set $S \subseteq [U]$ (where $n \definedas \abs{S}$), subject to:
\begin{itemize}
\item \texttt{insert(x)}: Updates $S$ to include $x$, i.e. $S \gets S \cup \{x\}$
\item \texttt{query(x)}: Query the system to ask if $x \in S$; i.e. return true if $x \in S$, false if $x \notin S$.
\end{itemize}
Here, we want the probability of a wrong answer to be at most $\varepsilon$.
We'll look at one (Monte Carlo) randomized data structure to solve this problem; it'll always be efficient, but it may give the wrong answer with some probability. In particular, we'll look at Bloom filters (from 1970)~\cite{Bloom70}.
A natural question is: what's the point of approximate membership? We've already seen dynamic hashing and linear probing, which gives linear space and expected constant time operations, and we can use these solutions to the dynamic dictionary problem to solve approximate membership.
The advantage of Bloom filters is that they use only $o(nw)$ bits of memory; we use less space than just storing the keys themselves. In particular, they use approximately $1.44n \log_2 \frac{1}{\varepsilon}$ bits (the $1.44$ comes from $\log_2(e)$).
The lower bound for the space complexity is $\Omega(n \ln \frac{1}{\varepsilon})$ bits of space (i.e. removing the $1.44$ constant multiple), which is actually achievable in the static case; it turns out it is impossible to achieve this space for the dynamic case, shown by Lovett and Porat in 2013 \cite{LP13}.
With Bloom filters, we first initialize a bit array $A$ of $m$ bits to all 0's. We then pick $k$ independent fully random hash functions mapping $[U] \to [m]$, and implement the operations as follows:
\begin{itemize}
\item \texttt{insert(x)}: set $A[h_i(x)] = 1$ for each $i$ from $1$ to $k$.
\item \texttt{query(x)}: query $A[h_i(x)]$ for each $i$ from $1$ to $k$, and take the AND of all of them.
\end{itemize}
Note that if $x$ was truly in the set, then all of these locations would be set to 1; otherwise, at least one of these locations is probably 0, and we'd say no (there is the possibility that other elements have hashed to some of the same locations).
We could also analyze this algorithm to find the optimal values for $m$ and $k$; the optimal value of $m$ turns out to be $1.44 n \log_2 \frac{1}{\varepsilon}$, and $k$ to be $\Theta(\ln \frac{1}{\varepsilon})$.
We won't show the analysis of traditional Bloom filters here, since we run into independence problems, but we'll analyze a slightly different version, which is easier to explain.
In the traditional Bloom filter, we have an array of length $m = \Theta(n \log \frac{1}{\varepsilon})$, and we set $k$ locations in the array to 1. (Note that we can possibly have two hash functions hashing to the same location.)
\begin{center}
\begin{tikzpicture}[boxandpointer]
\tikzarray{a}{,,,1,,1,,1,1,,}
\node[below=35pt of a-6] (x) {$x$};
\draw[gray,->] (x) edge[bend left, in=130] node[left] {\small $h_1(x)$} (a-4)
(x) edge node[left] {\small $h_2(x)$} (a-6)
(x) edge node[right] {\small $h_3(x)$} (a-8)
(x) edge[bend right, in=-120] node[below right] {\small $h_4(x)$} (a-9);
\draw[Bar-Bar] ($(a-1.south west) + (0, 1)$) -- ($(a-11.south east) + (0, 1)$) node[pos=0.5, above] {$m$};
\end{tikzpicture}
\end{center}
Today, we'll analyze a slightly different formulation; we have instead a 2D array with $k$ rows and $cn$ columns. When we insert $x$, we set the location $h_i(x)$ in row $i$ to 1, for each $i = 1, \ldots, k$. Here, the crucial difference is that each row is independent of the others.
\begin{center}
\begin{tikzpicture}
\def\s{0.75}
\foreach \x in {0,1,2,...,6} {
\foreach \y in {0,1,2,...,3} {
\draw (\s*\x, \s*\y) rectangle ({\s*(\x + 1)}, {\s*(\y + 1)});
}
}
\draw[Bar-Bar] (5.75, 0) -- node[right, pos=0.5] {$k$} (5.75, 3);
\draw[Bar-Bar] (0, -0.5) -- node[below, pos=0.5] {$cn$} (5.25, -0.5);
\node (x) at (-2, 1.5) {$x$};
\node (r0) at ({\s*1+0.375}, {\s*0+0.375}) {1};
\node (r1) at ({\s*6+0.375}, {\s*1+0.375}) {1};
\node (r2) at ({\s*3+0.375}, {\s*2+0.375}) {1};
\node (r3) at ({\s*4+0.375}, {\s*3+0.375}) {1};
\draw[gray,->] (x) edge[bend left, looseness=0.7] node[above left, pos=0.2] {\small $h_1(x)$} (r3);
\draw[gray,->] (x) edge node[above, pos=0.28] {\small $h_2(x)$} (r2);
\draw[gray,->] (x) edge node[below, pos=0.18] {\small $h_3(x)$} (r1);
\draw[gray,->] (x) edge[bend right] node[below left, pos=0.2] {\small $h_4(x)$} (r0);
\end{tikzpicture}
\end{center}
Notice that (in both versions) we never have any false negatives; we only have false positives. That is, we'll always be certain that an item is \emph{not} in the set, but we could incorrectly say that an item \emph{is} in the set.
\begin{claim}
If $x \notin S$, then $\Pr(\text{say $x \in S$}) \le \varepsilon$.
\end{claim}
\begin{proof}
We have a false positive exactly when $A[i][h_i(x)] = 1$ for every row $i = 1, \ldots, k$; since the rows are independent, the false positive probability is $\Pr(A[i][h_i(x)] = 1)^k$ for any fixed row $i$. To compute this value, suppose we look at the expected number of items in $S$ that collide with $x$ under $h_i$:
\[
\Exp[\text{\# items in $S$ colliding with $x$ under $h_i$}]
= \sum_{y \in S} \underbrace{\Pr(h_i(x) = h_i(y))}_{1 / cn}
= \frac{1}{c}
.\]
Notice that the probability that at least 1 item collides with $x$ is at most $\frac{1}{c}$ by Markov's inequality: having at least one collision means the number of colliding items takes a value at least $1$, which is $c$ times its expected value of $\frac{1}{c}$. (We can improve this analysis to get a better constant, but we omit it here.)
This probability we computed is the probability we get fooled in row $i$, so the probability we got fooled in \emph{all} $k$ rows is $\frac{1}{c^k}$, which we want to be at most $\varepsilon$. This gives $k = \Theta(\log \frac{1}{\varepsilon})$.
Notice that although we got a worse constant factor, we only needed 2-wise independence here, to deduce $\Pr(h_i(x) = h_i(y)) = \frac{1}{cn}$.
\end{proof}
If we wanted to get a better constant multiple for the space, we can reduce to the dictionary problem; we can map to a range where there is a small probability of collisions, and then store the set of $h(x)$'s using a dictionary that uses as little space as possible. Notice that the main reason why we were able to get away with using less memory in Bloom filters is because we just set $k$ bits to 1, instead of storing $x$. Here, we're doing something similar, storing $h(x)$ instead of storing $x$ in the dictionary.
\section{Cuckoo Hashing}
The goal of Cuckoo hashing \cite{PR01} is to solve the dynamic dictionary problem. In hashing with chaining and linear probing, the query and insertion time are both expected constant time, whereas in cuckoo hashing, query time is \emph{worst case} constant (we only look in two locations), and insertion time is expected constant. It's an open problem to get a solution to the dynamic dictionary problem that is worst case constant in both query and insertion. (It's theoretically possible to get linear space and constant time query and insertion time, deterministically, but the best way we know of is to use a balanced binary search tree.)
To implement Cuckoo hashing, we pick two independent fully random hash functions $h, g \from [U] \to [m]$, where $m$ is the size of the hash table. We then implement the operations as follows.
To query $x$, we check both $A[h(x)]$ and $A[g(x)]$. The invariant we'll maintain is that $x$ will always be in either $A[h(x)]$ or $A[g(x)]$. If $x$ is in neither location, then we say that $x$ is not in the database.
For insertion, we put $x$ in $A[g(x)]$. If $A[g(x)]$ was null, then we're done. However, if $A[g(x)]$ is already occupied, let $x'$ be the old key in $A[g(x)]$ (let $j = g(x)$ for ease of notation). If $j = g(x')$, then we move $x'$ into $A[h(x')]$; otherwise, move $x'$ to $A[g(x')]$ (since here $j = h(x')$). We then recurse in these two cases to insert $x'$ if there are any additional conflicts.
Intuitively, we insert $x$ into $A[g(x)]$, and move any existing element into their other hash location. If this element \emph{also} conflicts with another element, we continue moving these elements until there are no conflicts. However, it is possible for this to go on forever (if there's a cycle).
To resolve this, if this process goes on for longer than say $10\log n$ steps, we rebuild the entire hash table from scratch. That is, we pick new hash functions $h$ and $g$ and re-insert the items. We then repeat this process until it works, i.e. until we successfully insert every element back into the hash table. The main idea is that the probability of this occurring is very tiny, so it's unlikely this process will happen or go on for too long.
(This insertion algorithm is why this algorithm is called ``cuckoo hashing''; a cuckoo chick pushes other eggs or young out of the nest when it hatches, much like how this algorithm pushes existing elements out when inserting a new item.)
It's clear that query time is worst case constant, and we'll show that the insertion time is expected constant in a little bit.
As a little detour, cuckoo hashing can also be used to solve the static approximate dictionary problem in $O(nr)$ bits of memory.
The approximate dictionary problem is similar to approximate membership; the only difference here is that we store a set of $n$ key-value pairs, where all the values are $r$-bit strings. Notice that here we'll never actually store the keys. (This is in problem set 4; a hint is to use cuckoo hashing when the cuckoo graph has no cycles, where $\Pr(\text{no cycles}) \ge \frac{1}{2}$.)
\subsection{Cuckoo Hashing Analysis}
\begin{definition}[Cuckoo Graph]
A \emph{Cuckoo graph} is a multigraph (i.e. there could be multiple edges between the same vertices) illustrating the process of Cuckoo hashing. Vertices are locations in the hash table (so there are $m$ vertices), and edges connect between $h(x)$ and $g(x)$ for each key $x \in S$, where $S$ is the set of keys in the database (so there are $n$ edges).
\end{definition}
Let's look at the possible cases that can happen during an insertion.
\begin{itemize}
\item One case is that we have a \textbf{path}. (The squiggly line is the first hash of $x$, and other arrows denote the movement of elements due to conflicts.)
\begin{center}
\begin{tikzpicture}[node distance=45pt,line width=0.75pt]
\node[circ] (x) at (0, 0) {};
\node[left=of x] (x-label) {$x$};
\node[circ, right=of x] (x2) {};
\node[circ, right=of x2] (x3) {};
\node[circ, right=of x3] (x4) {};
\node[circ, right=of x4] (x5) {};
\draw[->] (x-label) edge[gray,decorate,decoration={snake,pre length=3pt,post length=4pt}] (x);
\draw[->] (x) edge node[below] {$x_2$} (x2)
(x2) edge node[below] {$x_3$} (x3)
(x3) edge node[below] {$x_4$} (x4);
\path (x4) -- (x5) node[pos=0.5] {$\cdots$};
\end{tikzpicture}
\end{center}
Here, $x$ hashes to where $x_2$ is originally, so $x_2$ moves to where $x_3$ is originally, and this continues until there are no collisions.
\item Another case is that we have a \textbf{single cycle}. (Solid arrows denote the first movement of an element due to a collision, and dashed arrows denote a second movement due to another collision after the cycle.)
\begin{center}
\begin{tikzpicture}[node distance=45pt,line width=0.75pt]
\node[circ] (x) at (0, 0) {};
\node[left=of x] (x-label) {$x$};
\node[circ, right=of x] (x2) {};
\node[circ, right=of x2] (x3) {};
\node[circ, right=of x3] (x4) {};
\node[circ, above=30pt of x4] (x5) {};
\node[circ, below=of x] (xp) {};
\node[circ, right=of xp] (x7) {};
\node[circ, right=of x7] (x8) {};
\node[circ, right=of x8] (x9) {};
\draw[->] (x-label) edge[gray,decorate,decoration={snake,pre length=3pt,post length=4pt}] (x);
\draw[->] (x) edge node[below] {$x_2$} (x2)
(x2) edge node[below] {$x_3$} (x3)
(x3) edge node[below] {$x_4$} (x4)
(x4) edge node[right] {$x_5$} (x5)
(x5) edge node[above left] {$x_6$} (x3)
(x3) edge[bend right, dashed] node[above] {$x_3$} (x2)
(x2) edge[bend right, dashed] node[above] {$x_2$} (x)
(x) edge node[left] {$x$} (xp)
(xp) edge node[below] {$x_7$} (x7)
(x7) edge node[below] {$x_8$} (x8);
\path (x8) -- (x9) node[pos=0.5] {$\cdots$};
\end{tikzpicture}
\end{center}
Here, the movement of $x_6$ causes $x_3$ to get moved once more, which propagates back to $x$. This means that we now try hashing $x$ using $h(x)$, which displaces $x_7$, etc., until we have no collisions along this second path.
\item A last case is that we have a \textbf{double cycle}, which does actually go on infinitely.
\begin{center}
\begin{tikzpicture}[node distance=45pt,line width=0.75pt]
\node[circ] (x) at (0, 0) {};
\node[left=of x] (x-label) {$x$};
\node[circ, right=of x] (x2) {};
\node[circ, right=of x2] (x3) {};
\node[circ, right=of x3] (x4) {};
\node[circ, above=30pt of x4] (x5) {};
\node[circ, below=45pt of x] (xp) {};
\node[circ, right=of xp] (x7) {};
\node[circ, right=of x7] (x8) {};
\node[circ, right=of x8] (x9) {};
\node[circ, below=30pt of x9] (x10) {};
\draw[->] (x-label) edge[gray,decorate,decoration={snake,pre length=3pt,post length=4pt}] (x);
\draw[->] (x) edge node[below] {$x_2$} (x2)
(x2) edge node[below] {$x_3$} (x3)
(x3) edge node[below] {$x_4$} (x4)
(x4) edge node[right] {$x_5$} (x5)
(x5) edge node[above left] {$x_6$} (x3)
(x3) edge[bend right, dashed] node[above] {$x_3$} (x2)
(x2) edge[bend right, dashed] node[above] {$x_2$} (x)
(x) edge node[left] {$x$} (xp)
(xp) edge node[above] {$x_7$} (x7)
(x7) edge node[above] {$x_8$} (x8)
(x8) edge node[above] {$x_9$} (x9)
(x9) edge node[right] {$x_{10}$} (x10)
(x10) edge node[below left] {$x_{11}$} (x8)
(x8) edge[bend left, dashed] node[below] {$x_8$} (x7)
(x7) edge[bend left, dashed] node[below] {$x_7$} (xp)
(xp) edge[bend right, dashed] node[right] {$x$} (x);
\end{tikzpicture}
\end{center}
Here, after we go through the first cycle, we hit another cycle in the second path; this causes an infinite loop of collisions that never resolves.
\end{itemize}
For the analysis, suppose we define:
\begin{itemize}
\item $T$: the runtime to do an insert
\item $P_k$: the indicator $\mathbf{1}\{\text{have path of length $\ge k$}\}$
\item $C_k$: the indicator $\mathbf{1}\{\text{have cycle of length $\ge k$}\}$
\item $D$: the indicator $\mathbf{1}\{\text{have a double cycle}\}$
\end{itemize}
Our expected runtime is
\begin{align*}
\Exp[T] &\le \underbrace{\Exp[\sum_{k=1}^{\infty} P_k]}_{\text{path case}} + \underbrace{\Exp[\sum_{k=1}^{\infty} C_k]}_{\text{cycle case}} + \underbrace{\Pr(D = 1)}_{\text{double cycle}} \cdot \underbrace{\qty(10 \log n + n\Exp[T])}_{\text{rebuild table}} \\
&+ \underbrace{\Pr(\text{path/cycle of length} \ge 10 \log n)}_{\text{took too long}} \cdot \underbrace{\qty(10 \log n + n\Exp[T])}_{\text{rebuild table}}
\end{align*}
We can show that $\Pr(P_k = 1) = \exp(-\Omega(k))$ for $m$ sufficiently large (say $m=4n$), and we'll also show that $\Pr(C_k = 1) = \exp(-\Omega(k))$. This means the sums in the first two expectations converge to a constant.
We'll also show that $\Pr(D = 1)$ and $\Pr(\text{path/cycle of length $\ge 10\log n$})$ are both $O(\frac{1}{n^2})$. This simplifies the runtime to
\begin{align*}
\Exp[T] &\le O(1) + O(1) + O\qty(\frac{1}{n^2}) \qty(10 \log n + n \Exp[T]) + O\qty(\frac{1}{n^2}) \qty(10 \log n + n \Exp[T]) \\
\Exp[T] &\le O(1) + O\qty(\frac{1}{n}) \Exp[T] \\
\qty(1 - O\qty(\frac{1}{n})) \Exp[T] &\le O(1) \\
\Exp[T] &\le O(1)
\end{align*}
\begin{claim}
$\Pr(P_k = 1) = \exp(-\Omega(k))$, in particular $\Pr(P_k = 1) \le \frac{1}{2^k}$.
\end{claim}
\begin{proof}
Notice that we can have a lot of possibilities for paths of length $k$; we have a choice of what elements are involved in the path. (We'll call each of these possibilities a ``realization'' of a path.)
The probability by union bound gives
\begin{align*}
\Pr(P_k = 1)
&\le \sum_{\substack{\text{all possible paths} \\ \text{$P$ of length $k$}}} \Pr(\text{have $P$})
\end{align*}
Notice that the number of possible realizations is at most $m^{k+1} \cdot n^k$, since there are at most $m$ possibilities for each of the $k+1$ vertices in the path, and $n$ possibilities for each of the $k$ edges in the path.
The probability of a fixed realization is
\[
\Pr(\text{a fixed realization}) \le \frac{1}{m} \cdot \frac{2^k}{m^{2k}}
.\]
Here, the $\frac{1}{m}$ factor is because $x$ must hash to the start of the path, and for each of the $k$ edges along the path, we have a probability at most $\frac{2}{m^2}$ for the pair of hash functions to hash to the two locations incident to the edge.
If we fix $m = 4n$, then this simplifies to
\begin{align*}
\Pr(P_k = 1) &\le (\text{\# possible realizations}) \cdot \Pr(\text{fixed realization}) \\
&\le m^{k+1} \cdot n^k \cdot \frac{2^k}{m^{2k+1}} \\
&= \qty(\frac{2n}{m})^k = \frac{1}{2^k}
\end{align*}
\end{proof}
To show $\Pr(C_k = 1) \le \exp(-\Omega(k))$, we break up the cycle into two parts; edges before the cycle (in blue), and edges after the cycle (in red).
\begin{center}
\begin{tikzpicture}[node distance=45pt,line width=0.75pt]
\node[circ] (x) at (0, 0) {};
\node[left=of x] (x-label) {$x$};
\node[circ, right=of x] (x2) {};
\node[circ, right=of x2] (x3) {};
\node[circ, right=of x3] (x4) {};
\node[circ, above=of x4] (x5) {};
\node[circ, below=of x] (xp) {};
\node[circ, right=of xp] (x7) {};
\node[circ, right=of x7] (x8) {};
\node[circ, right=of x8] (x9) {};
\draw[->] (x-label) edge[gray,decorate,decoration={snake,pre length=3pt,post length=4pt}] (x);
\draw[blue,->] (x) edge node[below] {$x_2$} (x2)
(x2) edge node[below] {$x_3$} (x3)
(x3) edge node[below] {$x_4$} (x4)
(x4) edge node[right] {$x_5$} (x5);
\draw[red,->] (x5) edge node[above left] {$x_6$} (x3)
(x3) edge[bend right, dashed] node[above] {$x_3$} (x2)
(x2) edge[bend right, dashed] node[above] {$x_2$} (x)
(x) edge node[left] {$x$} (xp)
(xp) edge node[below] {$x_7$} (x7)
(x7) edge node[below] {$x_8$} (x8);
\path (x8) -- (x9) node[pos=0.5] {$\cdots$};
\end{tikzpicture}
\end{center}
If the entire cycle is of length $k$, then at least one part must be of length at least $\frac{k}{2}$. We can then reuse the same analysis we did for paths to get a similar bound for cycles.
The probability of a path or cycle of length $\ge 10 \log n$ is similar; it's at most $2^{-10 \log n} = O(\frac{1}{n^{10}})$, which is definitely $O\qty(\frac{1}{n^2})$.
\begin{claim}
$\Pr(D = 1) = O\qty(\frac{1}{n^2})$.
\end{claim}
\begin{proof}
% TODO
\end{proof}
\section{Preview of Power of 2 Choices}
Think about hashing with chaining; the expected length of a linked list is $O(1)$, but the worst case load is on the order of $\frac{\log n}{\log\log n}$. To reduce this load, we can instead pick two random hash functions $h, g \from [U] \to [m]$. We'll also augment the hash table to keep track of the length of the linked list.
When we insert $x$, we look at both $h(x)$ and $g(x)$, and choose the linked list with fewer items. Intuitively, this can only do better than with only one hash function. With high probability, max load turns out to be at most $\frac{\ln \ln n}{\ln 2} + \Theta(1)$. (This essentially goes from a max load of $O(\ln n)$ to $O(\ln \ln n)$.)
\bibliographystyle{alpha}
\begin{thebibliography}{Blo70}
\bibitem[Blo70]{Bloom70}
Burton~H. Bloom.
\newblock Space/time trade-offs in hash coding with allowable errors.
\newblock {\em Communications of the ACM}, 13(7):422–426, Jul 1970.
\bibitem[LP13]{LP13}
Shachar Lovett and Ely Porat.
\newblock A space lower bound for dynamic approximate membership data
structures.
\newblock {\em SIAM Journal on Computing}, 42(6):2182–2196, Jan 2013.
\bibitem[PR01]{PR01}
Rasmus Pagh and Flemming~Friche Rodler.
\newblock Cuckoo hashing.
\newblock In Friedhelm~Meyer auf~der Heide, editor, {\em Algorithms — ESA
2001}, Lecture Notes in Computer Science, page 121–133, Berlin, Heidelberg,
2001. Springer.
\end{thebibliography}
\end{document}