\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm,bbm}
\usepackage{fullpage}
\usepackage[capitalise,nameinlink]{cleveref}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{fact}{Fact}{Facts}
\crefname{theorem}{Theorem}{Theorems}
\crefname{corollary}{Corollary}{Corollaries}
\crefname{claim}{Claim}{Claims}
\crefname{example}{Example}{Examples}
\crefname{problem}{Problem}{Problems}
\crefname{setting}{Setting}{Settings}
\crefname{definition}{Definition}{Definitions}
\crefname{assumption}{Assumption}{Assumptions}
\crefname{subsection}{Subsection}{Subsections}
\crefname{section}{Section}{Sections}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\H}{\mathcal{H}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 270: Combinatorial Algorithms and Data Structures
} \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}[section]
\newtheorem*{theorem*}{Theorem}
\newtheorem{itheorem}{Theorem}
\newtheorem{subclaim}{Claim}[theorem]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem*{proposition*}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{lemma*}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem*{conjecture*}{Conjecture}
\newtheorem{fact}[theorem]{Fact}
\newtheorem*{fact*}{Fact}
\newtheorem{exercise}[theorem]{Exercise}
\newtheorem*{exercise*}{Exercise}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem*{hypothesis*}{Hypothesis}
\newtheorem{conjecture}[theorem]{Conjecture}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{setting}[theorem]{Setting}
\newtheorem{construction}[theorem]{Construction}
\newtheorem{example}[theorem]{Example}
\newtheorem{question}[theorem]{Question}
\newtheorem{openquestion}[theorem]{Open Question}
% \newtheorem{algorithm}[theorem]{Algorithm}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{protocol}[theorem]{Protocol}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{exercise-easy}[theorem]{Exercise}
\newtheorem{exercise-med}[theorem]{Exercise}
\newtheorem{exercise-hard}[theorem]{Exercise$^\star$}
\newtheorem{claim}[theorem]{Claim}
\newtheorem*{claim*}{Claim}
\newtheorem{remark}[theorem]{Remark}
\newtheorem*{remark*}{Remark}
\newtheorem{observation}[theorem]{Observation}
\newtheorem*{observation*}{Observation}
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
% \topmargin 0pt
% \advance \topmargin by -\headheight
% \advance \topmargin by -\headsep
% \textheight 8.9in
% \oddsidemargin 0pt
% \evensidemargin \oddsidemargin
% \marginparwidth 0.5in
% \textwidth 6.5in
% \parindent 0in
% \parskip 1.5ex
\begin{document}
\lecture{11 --- February 21, 2023}{Spring 2023}{Prof.\ Jelani Nelson}{Amar Shah}
\section{Chernoff Bound}
To prove the Chernoff inequality, we will use the Markov inequality which we state here without proof:
\begin{theorem}[Markov's inequality]
For a non-negative random variable $Z$, we have that $\forall \lambda > 0$:
\[\Pr[Z \geq \lambda] \leq \frac{\E[Z]}{\lambda}\]
\end{theorem}
\begin{theorem}[Chernoff's inequality]
Take $X_1, ..., X_n \in \{0, 1\}$ independent, with $\Pr[X_i = 1] = p_i$, $X = \sum\limits_{i=1}^n X_i$, and $\mu = \E[X]$. Then, $\forall \epsilon > 0$, we have that:
\[\Pr[X > (1 + \epsilon)\mu] < \left[\frac{e^\epsilon}{(1 + \epsilon)^{1+ \epsilon}}\right]^\mu\]
\end{theorem}
\begin{proof}
We first note that $\Pr[X > (1 + \epsilon)\mu] = \Pr[e^{tX} > e^{t(1 + \epsilon) \mu}]$. This is true for any $t > 0$. Then note that $\Pr[e^{tX} > e^{t(1 + \epsilon) \mu}] < e^{-t(1 + \epsilon)\mu} \E[e^{tX}]$ using Markov's inequality. We will now try to find an upper bound on the moment generating function $\E[e^{tX}]$:
\begin{align*}
\E[e^{tX}] &= \E[e^{t\sum\limits_{i=1}^n X_i}] \\
&= \E[\prod\limits_{i=1}^n e^{tX_i}] \\
&= \prod\limits_{i=1}^n \E[e^{tX_i}] \\
&= \prod\limits_{i=1}^n (1 - p_i + p_i e^t) \text{ by considering cases} \\
&= \prod\limits_{i=1}^n (1 + p_i(e^t - 1)) \\
&\leq \prod\limits_{i=1}^n e^{p_i (e^t - 1)} \\
&= e^{\sum\limits_{i=1}^n p_i (e^t - 1)} \\
&= e^{\mu(e^t - 1)}
\end{align*}
Thus, we get that:
\begin{align*}
\Pr[e^{tX} &> e^{t(1 + \epsilon) \mu}] < e^{-t(1 + \epsilon)\mu} \E[e^{tX}] \\
&\leq e^{-t(1 + \epsilon)\mu} e^{\mu(e^t - 1)} \\
&= e^{\mu(e^t - 1 - t(1 + \epsilon))}
\end{align*}
By taking the first and second derivative, we get that $e^{\mu(e^t - 1 - t(1 + \epsilon))}$ is minimized when $t = \ln(1 + \epsilon)$. Plugging this in we get:
\begin{align*}
e^{\mu(e^t - 1 - t(1 + \epsilon))} &= e^{\mu(1 + \epsilon - 1 - \ln(1 + \epsilon) \cdot [1 + \epsilon])} \\
&= e^{\mu(\epsilon - \ln(1 + \epsilon) \cdot [1 + \epsilon])} \\
&= \frac{e^{\mu \epsilon}}{(1 + \epsilon)^{(1 + \epsilon) \mu}}
\end{align*}
\end{proof}
\section{Load Balancing Review}
Suppose that we have $n = m$ servers and tasks. Recall how we upper bounded the probability that some server would have more than $\lambda$ tasks last time:
\begin{align*}
\Pr[\exists \text{ server w/ load } \geq \lambda] &= \Pr[\bigvee\limits_{i=1}^m \text{ server } i \text{ has load } \geq \lambda] \\
&\leq \sum\limits_{i=1}^n \Pr[\text{ server } i \text{ has load } \geq \lambda] \text{ by Union Bound} \\
&= n \cdot \Pr[\text{ server } 1 \text{ has load } \geq \lambda] \\
&= n \cdot \Pr[\exists \text{ set } T \text{ of } \lambda \text{ jobs mapping to server } 1] \\
&\leq n \cdot \sum\limits_{T \subseteq [n]; |T| = \lambda} \Pr[\text{ all jobs } \in T \text{ map to } 1] \\
&= n \cdot \binom{n}{\lambda} \cdot \left(\frac{1}{n}\right)^\lambda \text{ using independence}
\end{align*}
Then, when $\lambda = O(\frac{\log(n)}{\log(\log(n))})$, we can show that this quantity is much smaller than $1$ using Stirling's approximation. The important thing to note here is that we did not need to use full independence for this proof. We just needed ``$\lambda$-wise independence'' for the last step. This realization motivates the following definitions in the next section.
\section{k-wise Independence}
\subsection{k-wise Independent variables}
\begin{definition}[k-wise Independent Random Variables]
$Y_1, Y_2, ..., Y_n$ are $k$-wise independent if for all subsets of size $k$, $Y_{i_1}, ..., Y_{i_k}$, and for all values $y_1, ..., y_k$, we have that $\Pr[\bigwedge\limits_{j=1}^k Y_{i_j} = y_j] = \prod\limits_{j=1}^k \Pr[Y_{i_j} = y_j]$, i.e. any subset of size $k$ is independent
\end{definition}
\begin{fact}
$k$-wise independence of a set of variables $Y_1, ..., Y_n$ for $k > 1$ implies $(k-1)$-wise independence, and thus it implies $l$-wise independence for all $1 \leq l < k$
\end{fact}
\begin{proof}
Say we have that $Y_1, ..., Y_n$ are $k$-wise independent and we have some subset $Y_{i_1}, ..., Y_{i_{k-1}}$. We pick some $Y_t$ that is not in this subset (we know that this can be done since $n \geq k$, otherwise $k$-wise independence would not make any sense). Then we have that:
\begin{align*}
\Pr[\bigwedge\limits_{j=1}^{k-1} Y_{i_j} = y_j] &= \sum\limits_z \Pr[Y_t = z \land \bigwedge\limits_{j=1}^{k-1} Y_{i_j} = y_j] \\
&= \sum\limits_z [\Pr[Y_t = z] \prod\limits_{j=1}^{k-1} \Pr[Y_{i_j} = y_j]] \text{ by k-wise independence} \\
&= (\prod\limits_{j=1}^{k-1} \Pr[Y_{i_j} = y_j]) \cdot \sum\limits_z \Pr[Y_t = z] \\
&= (\prod\limits_{j=1}^{k-1} \Pr[Y_{i_j} = y_j]) \cdot 1 \\
&= \prod\limits_{j=1}^{k-1} \Pr[Y_{i_j} = y_j]
\end{align*}
\end{proof}
\subsection{k-wise Independent Hash Functions}
\begin{definition}[k-wise Independent Hash Family]
A hash family $\mathcal{H}$ is just a set of functions mapping $[U]$ into $[m]$. A family is $k$-wise independent if $h(0), h(1), ..., h(U-1)$ are $k$-wise independent for $h$ drawn uniformly at random from the family
\end{definition}
The idea behind these hash functions is that we pick some $h \in \H$ u.a.r., but if we think about $h(0), ..., h(U-1)$ as random variables distributed over the possible values they take for each function $h \in \H$, then these are $k$-wise independent.
\begin{fact}
Specifying some $h \in \H$ takes $\log_2(|\H|)$ bits.
\end{fact}
Our goal will be to make $|\H|$ as small as possible.
\subsection{Some Examples}
\paragraph{Attempt 1:} Set $\H$ as the set of all functions mapping $[U]$ into $[m]$. Clearly, this is $k$-wise independent. To see this we take $m = 2$ for simplicity, i.e. we will map each $x$ to either $0$ or $1$. Then the probability that some $x \in [U]$ maps to $0$ is $\frac{2^{U-1}}{2^{U}} = \frac{1}{2}$: there are $2^{U}$ total hash functions in $\H$, but if we want $x$ to map to $0$, there are $2^{U - 1}$ hash functions that do so, since each of the remaining $U-1$ inputs can map to $0$ or $1$.
Now once we have that $x$ maps to $0$, what is then the probability that some $y \in [U]$ maps to $0$? By a similar argument it must be $\frac{2^{U - 2}}{2^{U - 1}} = \frac{1}{2}$.
Thus, it is not hard to see that this is in fact a fully independent hash family (not just $k$-wise), since setting any number of inputs to something will not affect the probability of what the other inputs can map to.
However, since $|\H| = m^U$, we know that $\log|\H| = U\log(m)$. We want to do better.
\paragraph{Attempt 2:} We start in the case where $U = m = p$ which is some prime. Set $\H_{poly(k)} = \{h(x): h(x) = (\sum\limits_{i=0}^{k-1} a_i x^i) \pmod{p}\}$. Then we know that $|\H_{poly(k)}| = p^k = m^k$ and thus $\log|\H_{poly(k)}| = k \log(m)$ which is much better.
To show that this is k-wise independent, take $i_1, ..., i_k \in [U]$ and $y_1, ..., y_k \in [m]$. Then:
\begin{align*}
\Pr\limits_{h \in \H_{poly(k)}} [\bigwedge\limits_{j = 1}^{k} h(i_j) = y_j] &= \frac{\# \text{ of h's s.t. } \forall \text{ } j, h(i_j) = y_j}{|\H_{poly(k)}|} \\
&= \frac{1}{p^k}
\end{align*}
Clearly the denominator is $p^k$, but to see why the number of h's s.t. $\forall j$ $h(i_j) = y_j$ is 1, we can note that $h$ is a polynomial of degree at most $k-1$ over our finite field and we want it to go through $k$ given points. By polynomial interpolation, there is exactly one way to do this.
Finally, we may want to get around the condition that $m = U$. We still assume that $U = p$ which is some prime. Then we define $\hat{\H}_{poly(k)} = \{h(x): h(x) = ((\sum\limits_{i=0}^{k-1} a_i x^i) \bmod p) \bmod m\}$. This works almost as well since we get that $|\hat{\H}_{poly(k)}| = m^k$ which gives us the same complexity as before.
\section{Linear Probing Analysis}
\subsection{Dictionary Review}
Recall the problem from last lecture, the dictionary problem on a universe of size $u$.
In hashing with chaining, we initialize $m$ ``bins'' and $h(x)$ tells you which bin the item should go in.
If there is a hashing collision, where two items hash to the same thing, then we instead create a linked list with both the items.
To query, you have to walk along the linked list to find your queried item.
\begin{claim}
For all $x \in [u]$, the expected time to query $x$ is $O(1 + \frac{n}{m})$.
\end{claim}
In static dictionary, there is a known data structure to take linear space and have constant time query.
However, there is no known algorithm for this regime in the dynamic problem, nor is there a lower bound disallowing it.
\subsection{Linear Probing}
However, this approach is not great for cache reasons, so instead we use linear probing.
We still keep an array of size $m$, but when inserting $x$ and finding a collision, we start at $h(x)$
and continue along in the array until we find an empty space. We do a similar walk for a query.
\begin{definition}
An interval $I \subseteq [m]$ in our array is \emph{full} if the number of keys in the database hashing to $I$ is $\geq |I|$
\end{definition}
\begin{lemma}
Suppose $query(x)$ took $k$ steps. Then $h(x)$ is contained in $\geq k$ full intervals of all different lengths.
\end{lemma}
\begin{proof}
Since we know that $query(x)$ took $k$ steps, it must be that slots $h(x), h(x)+1, ..., h(x) + k - 1$ are all occupied. Say that $h(x) - j$ is the first empty slot before $h(x)$. Then at least $j$ keys must hash into the interval $h(x)-j+1, ..., h(x)$, since $h(x) - j$ is empty but the slots $h(x)-j+1, ..., h(x)$ are all occupied; hence that interval is full.
Similarly, for all $l$ such that $0 \leq l \leq k-1$, at least $l+j$ keys must hash into the interval $h(x)-j+1, ..., h(x)+l$, so each of these $k$ intervals (of distinct lengths) is full. This proves the claim.
\end{proof}
\subsection{Analysis}
Today, we will do the analysis assuming fully independent hashing. Next time we will do it for $7$-wise and $5$-wise independent hashing. Recall that last time we talked about the famous theorem by Donald Knuth:
\begin{theorem}[Knuth \cite{knuth:1963}]
In a hash table with linear probing with $m = (1 + \epsilon)n$, then
\[ \mathbb{E}(\text{query time}) = O(1 / \epsilon^2) \]
\end{theorem}
Today, we will show a slightly weaker version of it:
\begin{theorem}
In a hash table with linear probing with $m = 2n$, then
\[ \mathbb{E}(\text{query time}) = O(1)\]
\end{theorem}
\begin{proof}
Note that for some interval $I$, $\E[\# \text{ items that hash to } I] = \frac{|I|}{2}$ since $m = 2n$. Thus, by the Chernoff bound we have that $\Pr[\text{a length } k \text{ interval is full}] \leq e^{-\Omega(k)}$.
The number of probes to query(x) is $\leq \sum\limits_{k=1}^\infty \mathbbm{1}_{\exists \text{ length k full interval containing } h(x)}$. Thus, we have that:
\begin{align*}
\E[\# \text{ probes to } query(x)] &\leq \sum\limits_{k=1}^\infty \Pr[\exists \text{ length } k \text{ full interval containing } h(x)] \\
&\leq \sum\limits_{k=1}^\infty k \Pr[\text{a specific length } k \text{ interval containing } h(x) \text{ is full}] \text{ by Union Bound} \\
&\leq \sum\limits_{k=1}^\infty k e^{-\Omega(k)} \text{ by the Chernoff bound} \\
&= O(1)
\end{align*}
\end{proof}
Note that the sum $\sum\limits_{k=1}^\infty k e^{-\Omega(k)}$ actually converges faster than it needs to in order to get the necessary bound. This gives intuition for how we are going to show this for $7$-wise and $5$-wise independent hashing next time.
\bibliographystyle{alpha}
\begin{thebibliography}{42}
\bibitem{knuth:1963}
Donald Knuth.
\newblock Notes on ``open'' addressing, 1963.
\newblock URL: http://jeffe.cs.illinois.edu/teaching/datastructures/2011/notes/knuth-OALP.pdf.
\end{thebibliography}
\end{document}