\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fullpage}
\usepackage[capitalise,nameinlink]{cleveref}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{fact}{Fact}{Facts}
\crefname{theorem}{Theorem}{Theorems}
\crefname{corollary}{Corollary}{Corollaries}
\crefname{claim}{Claim}{Claims}
\crefname{example}{Example}{Examples}
\crefname{problem}{Problem}{Problems}
\crefname{setting}{Setting}{Settings}
\crefname{definition}{Definition}{Definitions}
\crefname{assumption}{Assumption}{Assumptions}
\crefname{subsection}{Subsection}{Subsections}
\crefname{section}{Section}{Sections}
\usepackage{graphicx}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 270: Combinatorial Algorithms and Data Structures
} \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}[section]
\newtheorem*{theorem*}{Theorem}
\newtheorem{itheorem}{Theorem}
\newtheorem{subclaim}{Claim}[theorem]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem*{proposition*}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{lemma*}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem*{conjecture*}{Conjecture}
\newtheorem{fact}[theorem]{Fact}
\newtheorem*{fact*}{Fact}
\newtheorem{exercise}[theorem]{Exercise}
\newtheorem*{exercise*}{Exercise}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem*{hypothesis*}{Hypothesis}
\newtheorem{conjecture}[theorem]{Conjecture}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{setting}[theorem]{Setting}
\newtheorem{construction}[theorem]{Construction}
\newtheorem{example}[theorem]{Example}
\newtheorem{question}[theorem]{Question}
\newtheorem{openquestion}[theorem]{Open Question}
% \newtheorem{algorithm}[theorem]{Algorithm}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{protocol}[theorem]{Protocol}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{exercise-easy}[theorem]{Exercise}
\newtheorem{exercise-med}[theorem]{Exercise}
\newtheorem{exercise-hard}[theorem]{Exercise$^\star$}
\newtheorem{claim}[theorem]{Claim}
\newtheorem*{claim*}{Claim}
\newtheorem{remark}[theorem]{Remark}
\newtheorem*{remark*}{Remark}
\newtheorem{observation}[theorem]{Observation}
\newtheorem*{observation*}{Observation}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
% \topmargin 0pt
% \advance \topmargin by -\headheight
% \advance \topmargin by -\headsep
% \textheight 8.9in
% \oddsidemargin 0pt
% \evensidemargin \oddsidemargin
% \marginparwidth 0.5in
% \textwidth 6.5in
% \parindent 0in
% \parskip 1.5ex
\begin{document}
\lecture{12 --- February 23, 2023}{Spring 2023}{Prof.\ Jelani Nelson}{Matthew Ding, Jonathan Tay}
\section{Overview}
In the last lecture we introduced hashing with linear probing, and proved that it achieves constant expected query time with a fully-random hash function.
In this lecture we show linear probing with a $k$-wise independent hash function also achieves constant expected query time for $k=7$ and $k=5$. The proof comes from Pagh, Pagh, and Ru\v{z}i\'{c} \cite{PaghPaghRuzic}, and utilizes the ``symmetrization trick''. We also briefly introduce the approximate membership and dictionary problems.
\section{Linear Probing with k-wise Independent Hashing}
\subsection{7-wise Independent Hashing}
We first assume that the length of the hash table is $m = 2n$. In the previous lecture we showed that
\begin{equation} \label{initial_ineq}
\E[\# \text{ probes to } \text{query}(z)] \leq \sum\limits_{k=1}^\infty k \cdot \Pr(\text{a specific length $k$ interval containing $h(z)$ is full})
\end{equation}
Note that the fullness of an interval concerns the actual locations where keys are stored, as opposed to $h(z)$, which is the location that the key $z$ hashes to.\\
We define $E_k$ to be the indicator random variable
\begin{equation}
E_k=
\begin{cases}
1 & \text{if $z$ is contained in a full interval of length} \ge k\\
0 & \text{otherwise }
\end{cases}
\end{equation}
\begin{align*}
\E\left[\sum_{k=1}^\infty E_k\right] &= \sum_{k=1}^{\infty} \Pr(E_k = 1) &&\text{($E_k$ is an indicator variable)}\\
&= \sum_{k=1}^{\infty} \Pr\left( \bigvee_{i=1}^k \; \text{ the $i^\text{th}$ $k$-interval containing $h(z)$ is full}\right)\\
&\le \sum_{k=1}^{\infty} k \cdot \Pr(\text{a $k$-interval containing $h(z)$ is full}) &&\text{(union bound; symmetry between intervals)}\\
&\le \sum_{k=1}^{\infty} k\cdot e^{-\Omega(k)} &&\text{(by the Chernoff bound)}\\
&= O(1)
\end{align*}
\textbf{Problem}: We need a fully random hash function to apply the Chernoff bound in the last step.

\textbf{Solution}: Use 7-wise independent hashing, and bound $\Pr(\text{a $k$-interval containing $h(z)$ is full}) = O\left(\frac{1}{k^3}\right)$.\\
Let $I$ denote the specific (arbitrary) interval in \cref{initial_ineq}. Define the indicator random variable $X_i$ to be
\begin{equation}
X_i=
\begin{cases}
1 & \text{if } h(i\text{th key}) \in I\\
0 & \text{otherwise }
\end{cases}
\end{equation}
Additionally we define the load on interval $I$ as $L(I)=\sum_{i=1}^n X_i$. Note that $\E[L(I)] = \frac{k}{2}$.
\begin{definition}[Full Interval]
An interval $I$ is considered full if $|\{x: h(x) \in I\}| \ge |I|$.
\end{definition}
We now seek to bound the probability that a length $k$ interval is full.
\begin{align}
\Pr\left(\left|L(I) - \E[L(I)]\right| > \frac{k}{2}\right) &< \left(\dfrac{k}{2}\right)^{-6} \cdot \E\left[\left( L(I) - \E[L(I)] \right)^6\right] &&\text{(by \cref{thm:markov})}\\
&= \left(\dfrac{k}{2}\right) ^{-6} \cdot \E\left[\left(\sum_{i=1}^n X_i - \dfrac{k}{2}\right)^6\right]
\end{align}
To bound the final term, one could expand out the sixth power combinatorially, but it is cleaner to use a probability trick known as ``symmetrization''.
\paragraph{Probability and Vector Detour}
The $\ell_p$ norm of a vector is defined as $\|x\|_p := (\sum_i |x_i|^p)^{1/p}$. For random variables, we can analogously define the $L^p$ norm as $\|X\|_p := (\E[|X|^p])^{1/p}$.
We will also use the following inequalities without proof:
\begin{theorem}[Extended Markov Inequality] \label{thm:markov}
For a nonnegative random variable $X$, we have
\begin{equation}
\Pr(|X| > a) \leq \frac{\E[|X|^n]}{a^n}
\end{equation}
\end{theorem}
\begin{theorem}[Minkowski's Inequality] \label{thm:minkowski}
$L^p$ spaces are normed vector spaces. Therefore, for $p\geq1$, we have the triangle inequality
\begin{equation}
\|x+y\|_p \leq \|x\|_p + \|y\|_p
\end{equation}
\end{theorem}
\begin{theorem}[Jensen's Inequality] \label{thm:jensen}
If $f$ is a convex function and $X$ is a random variable, then
\begin{equation}
f(\E[X]) \leq \E[f(X)]
\end{equation}
\end{theorem}
Lastly, we also note that we are able to define a new random variable $Y$ that is drawn from the same distribution as $X$ but independent from $X$. Combined with \cref{thm:jensen}, this allows us to do the following:
\begin{equation}
\E[|X - \mu|] = \E[|X - \E[Y]|] = \E_X[|\E_Y[X-Y]|] \le \E_X[\E_Y[|X-Y|]] = \E[|X-Y|]
\end{equation}
% Probability detour done
Now, back to bounding the sixth central moment from above. To make things easier, we first deal with the expression without the exponent. We also define another random variable $X'$ drawn from the same distribution as $X$ but independently selected, and define $\sigma_i \in \{-1,1\}$ to be a uniform independent random variable.
\begin{align*}
\left\|\sum_{i=1}^n (X_i - \E[X_i])\right\|_6&= \left\|\E_{X'}\left[\sum_i X_i - \sum_i X_i'\right]\right\|_6 &&\text{(symmetrization, as above)}\\
&\le \left\| \sum_i (X_i - X_i') \right\|_6 &&\text{(by \cref{thm:jensen})}\\
&= \left\| \sum_i \sigma_i (X_i - X_i') \right\|_6 &&\text{(by symmetry)}\\
&\le 2 \left\| \sum_i \sigma_i X_i \right\|_6 &&\text{(by \cref{thm:minkowski})}
\end{align*}
We justify the symmetry step by setting $Z_i := X_i - X_i'$, which is a valid random variable because $X_i$ and $X_i'$ are independently chosen from the same distribution. Its symmetry can be observed as $\Pr(Z_i = \alpha) = \Pr(Z_i = -\alpha)$ for all $\alpha \in \mathbb{R}$, so $(Z_i)_i$ has the same joint distribution as $(\sigma_i Z_i)_i$.\\
Now if we include back the expectation and the exponent,
\begin{align*}
\E\left[\left(\sum_{i=1}^n X_i - \dfrac{k}{2}\right)^6\right] &= \E\left[\left(\sum_{i=1}^n (X_i - \E[X_i])\right)^6\right] \\
&\le 2^6 \cdot \E\left[\left(\sum_i \sigma_i X_i\right)^6\right] &&\text{(by the norm bound above)} \\
&= 2^6 \sum_{i_1, i_2, \dots, i_6} \E [X_{i_1}\cdots X_{i_6}] \cdot \E[\sigma_{i_1}\cdots \sigma_{i_6}] &&\text{(expansion of the sixth power; $\sigma \perp X$)}
\end{align*}
We note that
\[
\E[\sigma_i^j] =
\begin{cases}
1, & \text{if $j$ is even}\\
0, & \text{if $j$ is odd}
\end{cases}
\]
because $\sigma_i$ is uniform on $\{-1, 1\}$: any even power of $\sigma_i$ is identically $1$, while the expectation of an odd power is $0$. This allows us to remove all terms of the expanded exponentiation containing an odd power of some $\sigma_i$. The surviving terms are those whose index multiplicities have the form $\{(2,2,2), (4,2), (6)\}$. Additionally, we note that we can treat any $X_{i_1}, \dots, X_{i_6}$ as independent: since we have a 7-wise independent hash function, the keys $i_1, i_2, \dots, i_6$ and $z$ are effectively hashed to random independent locations.
To take the first pattern $(2,2,2)$ as an example: there are 3 distinct indices among the 6 factors, each appearing exactly twice. The probability that 3 specific keys all hash into $I$ is $\left(\dfrac{k}{m}\right)^3$, and the number of ways to choose these indices is at most $n^3$. This gives a contribution of $O\left(n^3 \cdot \left(\dfrac{k}{m}\right)^3\right)$. Analogously, the pattern $(4,2)$ contributes $O\left(n^2 \cdot \left(\dfrac{k}{m}\right)^2\right)$, and the pattern $(6)$ contributes $O\left(n \cdot \dfrac{k}{m}\right)$.
\begin{align*}
\sum_{i_1, i_2, \dots, i_6} \E [X_{i_1}\cdots X_{i_6}] \cdot \E[\sigma_{i_1}\cdots \sigma_{i_6}] &= O\left(n^3 \cdot \left(\dfrac{k}{m}\right)^3\right) + O\left(n^2 \cdot \left(\dfrac{k}{m}\right)^2\right) + O\left(n \cdot \dfrac{k}{m}\right)\\
&= O\left(n^3 \cdot \left(\dfrac{k}{m}\right)^3\right)\\
&= O(k^3) &&\text{($m = 2n$)}
\end{align*}
Using this result, we get that
\begin{align*}
\left(\dfrac{k}{2}\right) ^{-6} \cdot \E\left[\left(\sum_{i=1}^n X_i - \E[X_i]\right)^6\right] &= \left(\dfrac{k}{2}\right) ^{-6} \cdot O(k^3)\\
&= O\left(\dfrac{1}{k^3}\right)
\end{align*}
Plugging back into \cref{initial_ineq}, we have
\begin{align*}
\E[\# \text{ probes to } \text{query}(z)] &\leq \sum\limits_{k=1}^\infty k \cdot \Pr(\text{a specific length $k$ interval containing $h(z)$ is full}) \\ &= \sum\limits_{k=1}^\infty k \cdot O\left(\frac{1}{k^3}\right)
\\ &= O(1)
\end{align*}
giving us expected constant query time with a 7-wise independent hash function, as desired.
\subsection{5-wise Independent Hashing}
\begin{figure}
\centering
\includegraphics[scale=0.5]{5wise.png}
\caption{Diagram of 5-wise independent hash function proof, $m=16$, $k=8$}
\label{fig:5wise_hashing}
\end{figure}
We first note that $5$-wise independence is optimal: P{\v{a}}tra{\c{s}}cu and Thorup \cite{PatrascuThorup} showed that there exist $3$-wise and $4$-wise independent hash families for which linear probing requires expected logarithmic query time on specific keys.
We now sketch the proof (by picture) that a 5-wise independent hash function is still sufficient for expected constant query time. Our goal is to construct a constant number of intervals where if $I$ is full, at least one of these intervals is ``almost full''.
\begin{enumerate}
\item Construct a perfectly balanced binary search tree with the leaves corresponding to the entries of our array. Round up $k$ to nearest power of 2 and consider the union of all arbitrary length $k$ intervals that cover $h(z)$ (colored yellow in \cref{fig:5wise_hashing}).
\item Go to the level of the tree where every node has $k$ leaves (marked in green).
\item Go 2 levels lower, where each node has $k/4$ leaves (marked in pink). The total number of pink nodes that intersect the union of all possible length $k$ intervals containing $h(z)$ is $O(1)$, since each pink node covers $k/4$ leaves and the yellow region has length at most $2k$. In particular, at most 5 pink intervals intersect $I$.
\end{enumerate}
To reuse the equation used for the proof of 7-wise independent hashing
\begin{align*}
\E[\text{Runtime}] &\le \E\left[\sum_{k=1}^\infty E_k\right] \\
&= \sum_{k=1}^{\infty} \Pr(E_k = 1) &&\text{($E_k$ is an indicator variable)}\\
&= \sum_{k=1}^{\infty} \Pr\left( \exists \text{ a full length $k$-interval containing $h(z)$}\right)\\
&\le \sum_{k=1}^{\infty} \Pr\left( \exists \text{ a pink interval that is almost full}\right)\\
&\le \sum_{k=1}^\infty O(1) \cdot \Pr(\text{a particular pink interval is almost full}) &&\text{(union bound; symmetry between intervals)}
\end{align*}
\begin{claim}
If $I$ is full, at least one pink interval must be at least 3/5 full.
\end{claim}
\begin{proof}
The length $k$ interval $T$ is fully contained in the union of $\leq 5$ pink nodes; call them $b_1,\dots,b_5$. If $T$ is full, then $k$ elements hash to $T$, which by pigeonhole means one of $b_1,\dots,b_5$ must have at least $k/5$ elements hashed to it. But that $b_i$ is an interval of length $k/4$, and $\frac{k/5}{k/4} = 80\%$, so it is at least $80\%$ full, and thus more than $3/5$ full.
This assumes though that $k$ is a power of 2. In reality we pick the pink nodes by rounding $k$ up to the nearest power of 2, then going down 2 levels, which corresponds to dividing by 4 (so if we care about $k=14$, we would have pink nodes covering intervals of length 4). If the pink node interval length is $t$, then we know $1/4 \leq t/k < 1/2$ (close to $1/2$ if $k$ is 1 more than a power of 2). The ``worst case'' is when $t/k$ is close to $1/2$, in which case we really have $b_i$'s being intervals of length $\approx k/2$ and $T$ covered by $\leq 3$ pink nodes, so the pigeonhole argument gives $\frac{k/3}{k/2} \approx 66.7\% > 3/5$.
\end{proof}
Now we see that the intervals represented by pink nodes satisfy our desired property. From this, we see that we can actually modify \cref{initial_ineq} as
\begin{equation}
\E[\# \text{ probes to } \text{query}(z)] \leq \sum\limits_{k=1}^\infty \Pr(\text{there exists a pink interval that is ``almost'' full})
\end{equation}
Using the symmetrization trick with a 5-wise independent hash function instead of 7-wise (bounding the fourth moment instead of the sixth), we can bound this as
\begin{equation}
\sum\limits_{k=1}^\infty \Pr(\text{there exists a pink interval that is ``almost'' full}) \leq \sum\limits_{k=1}^\infty O(1) \cdot O\left(\frac{1}{k^2}\right) = O(1)
\end{equation}
giving us expected constant query time with a 5-wise independent hash function, as desired.
\section{Approximate Membership and Dictionary}
Solutions to the dictionary problem typically take $O(nW)$ bits, where $W$ is the word size. For data structures with smaller space complexity, we settle for \emph{approximate} solutions.
\subsection{Approximate Membership Problem}
The following is the approximate membership problem: Store a database of keys subject to
\begin{enumerate}
\item insert(x): adds $x$ to the database
\item query(x): returns whether $x$ is in the database. If $x$ is actually present, it returns \textsc{yes} with probability 1. If $x$ is not actually present, it returns \textsc{no} with probability $\geq 1 - \epsilon$.
\end{enumerate}
We wish to solve this problem with $O(n\log \frac{1}{\epsilon})$ bits of space. In the next lecture, we will see how to do this with Bloom filters.
\subsection{Approximate Dictionary Problem}
Approximate dictionary has a very similar setup as approximate membership. Assuming that a key is not in the database, querying it outputs an arbitrary value with a probability $\leq \epsilon$. If the key actually is in the database, it will return the correct value with probability 1. In the next lecture, we will see how to implement this data structure using bloomier filters and cuckoo hashing.
\bibliographystyle{alpha}
\begin{thebibliography}{42}
\bibitem{PaghPaghRuzic}
Anna~Pagh, Rasmus~Pagh, Milan~Ru\v{z}i\'{c}.
\newblock Linear Probing with 5-wise Independence.
\newblock {\em SIAM Review}, 53(3):547--558, 2011.
\bibitem{PatrascuThorup}
Mihai~P{\v{a}}tra{\c{s}}cu, Mikkel~Thorup.
\newblock On the k-Independence Required by Linear Probing and Minwise Independence.
\newblock ACM Transactions on Algorithms (TALG) 12(1): 1-27, 2015.
\end{thebibliography}
\end{document}