\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fullpage}
\usepackage{hyperref}
\usepackage[capitalise,nameinlink]{cleveref}
\usepackage{listings}% http://ctan.org/pkg/listings
\lstset{
basicstyle=\ttfamily,
mathescape
}
\hypersetup{hidelinks,hypertexnames=false,colorlinks=true,allcolors=blue}
\usepackage{cite}
\crefname{lemma}{Lemma}{Lemmas}
\crefname{fact}{Fact}{Facts}
\crefname{theorem}{Theorem}{Theorems}
\crefname{corollary}{Corollary}{Corollaries}
\crefname{claim}{Claim}{Claims}
\crefname{example}{Example}{Examples}
\crefname{problem}{Problem}{Problems}
\crefname{setting}{Setting}{Settings}
\crefname{definition}{Definition}{Definitions}
\crefname{assumption}{Assumption}{Assumptions}
\crefname{subsection}{Subsection}{Subsections}
\crefname{section}{Section}{Sections}
\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\varepsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf CS 270: Combinatorial Algorithms and Data Structures
} \hfill #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}
\newtheorem{theorem}{Theorem}[section]
\newtheorem*{theorem*}{Theorem}
\newtheorem{itheorem}{Theorem}
\newtheorem{subclaim}{Claim}[theorem]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem*{proposition*}{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{lemma*}{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem*{conjecture*}{Conjecture}
\newtheorem{fact}[theorem]{Fact}
\newtheorem*{fact*}{Fact}
\newtheorem{exercise}[theorem]{Exercise}
\newtheorem*{exercise*}{Exercise}
\newtheorem{hypothesis}[theorem]{Hypothesis}
\newtheorem*{hypothesis*}{Hypothesis}
\newtheorem{conjecture}[theorem]{Conjecture}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{setting}[theorem]{Setting}
\newtheorem{construction}[theorem]{Construction}
\newtheorem{example}[theorem]{Example}
\newtheorem{question}[theorem]{Question}
\newtheorem{openquestion}[theorem]{Open Question}
% \newtheorem{algorithm}[theorem]{Algorithm}
\newtheorem{problem}[theorem]{Problem}
\newtheorem{protocol}[theorem]{Protocol}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{exercise-easy}[theorem]{Exercise}
\newtheorem{exercise-med}[theorem]{Exercise}
\newtheorem{exercise-hard}[theorem]{Exercise$^\star$}
\newtheorem{claim}[theorem]{Claim}
\newtheorem*{claim*}{Claim}
\newtheorem{remark}[theorem]{Remark}
\newtheorem*{remark*}{Remark}
\newtheorem{observation}[theorem]{Observation}
\newtheorem*{observation*}{Observation}
% additional packages/commands
\usepackage{mathtools}
\usepackage{xparse}
\usepackage{suffix} % commands with suffix
\usepackage{physics}
\usepackage{parskip, float}
\usepackage{microtype} % Micro-kerning
\newcommand\definedas\coloneqq
\newcommand\from{:}
\renewcommand{\Pr}{\operatorname{\mathbb{P}}\mathopen{}\opbraces{}} % Probability
\WithSuffix\newcommand\Pr*{\operatorname{\mathbb{P}}\mathopen{}} % Probability (no qty)
\newcommand{\Exp}{\operatorname{\mathbb{E}}\mathopen{}\opbraces{}} % Expected value
\WithSuffix\newcommand\Exp*{\operatorname{\mathbb{E}}\mathopen{}} % Expected value (no qty)
% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
% \topmargin 0pt
% \advance \topmargin by -\headheight
% \advance \topmargin by -\headsep
% \textheight 8.9in
% \oddsidemargin 0pt
% \evensidemargin \oddsidemargin
% \marginparwidth 0.5in
% \textwidth 6.5in
% \parindent 0in
% \parskip 1.5ex
\begin{document}
\lecture{14 -- March 2nd, 2023}{Spring 2023}{Prof.\ Jelani Nelson}{Nikki Suzani, Eshaan Bhansali}
\section{Power of Two Choices}
The power of two choices~\cite{ABKU99,Mitz01} takes two random hash functions $h, g$ that map $[U] \rightarrow [n]$, and finds that the expected max load on each slot goes from $\frac{\ln(n)}{\ln\ln(n)}$ to $\frac{\ln \ln(n)}{\ln(2)} + O(1)$. Note that with $k \geq 2$ hash functions, there is only a constant factor improvement, with the $\ln(2)$ in the denominator changing to $\ln(k)$.
To prove this, we'll do analysis based on height.
Given $x \in DB$, we define the height of some $x$ to be its height in the stack at the time it was inserted; for example, if $x$ was the $h$th item put into its bucket, then its height is $h$. Our goal is to show that when we get to some height of $\frac{\ln \ln (n)}{\ln (2)}$ the expected number of elements with that height is less than 1. Since this number must be an integer, this means that we expect no elements with that height, and the expected max load is bounded by $\frac{\ln \ln (n)}{\ln (2)} + O(1)$.
Define $B_i$ to be the number of slots with at least $i$ items. Let's look at the ratio $\frac{B_i}{n}$. If we can show that this fraction is less than $\frac{1}{n}$ we have proven the statement, since it implies there is fewer than one bucket with the given load.
Note that clearly $\frac{B_1}{n} \leq 1$, as we have $n$ slots. Further, if a bucket has load at least $i+1$, then there is exactly one element in that bucket with height $i + 1$.
\begin{align*}
B_{i+1} &\leq \sum_x \mathbf{1}\{\textrm{height($x$)} \geq i + 1\}
\end{align*}
We can now take the expectation.
\begin{align}
\E[B_{i+1}] &\leq \sum_x \E[\mathbf{1}\{\textrm{height($x$)} \geq i + 1\}]\\
\E[B_{i+1}] &\leq \sum_x \Pr\{\textrm{height($x$)} \geq i + 1\}\\
\E[B_{i + 1}] &\leq n \cdot \Pr(\textrm{height($x$) for some $x$} \geq i + 1)\\
&= n \cdot \Pr(\textrm{height($x$)} \geq i + 1)\\
&= n\left(\frac{B_i}{n}\right)^2 \\
&= \frac{B_i^2}{n}
\end{align}
Here, (5) comes from the fact that all these $B_i$ are random variables related to each other, based on the same hash functions. Pretend the previous $B_j$ for $j \in [1, i]$ are fixed. For the height of $x$ to be at least $i+1$, each of the two places $x$ hashed must already have load at least $i$. The probability that a single hash function takes us to a place with load at least $i$ is $\frac{B_i}{n}$, and the two hashes are independent, so we can substitute in the square $\left(\frac{B_i}{n}\right)^2$.
Dividing both sides by $n$, we see that
\[\E \left[\frac{B_{i+1}}{n}\right] \leq \left(\frac{B_{i}}{n}\right)^2.\]
This analysis is a little hand-wavy, since we already considered the randomness. However, the key is that if things are going according to their expectation, then the fraction of buckets that have load $i$ is decreasing in this way.
We know that $\frac{B_2}{n} \leq \frac{1}{2}$ with probability 1, by counting the places where elements can hash. If things go according to their expectation, then generally
\[ \frac{B_{2+j}}{n} \leq \frac{1}{2^{2^j}}\]
To solve, we want $\frac{B_i}{n} < \frac{1}{n}$.
\[\frac{1}{2^{2^j}} < \frac{1}{n}\]
Taking base-2 logarithms twice, this holds when
\[j \geq \log\log(n).\]
Another paper from 2003 \cite{V03} takes $d$ hash functions and splits up a hash table of $n$ slots into $d$ buckets with $\frac{n}{d}$ slots. Here, you have a hash function for each bucket, such that when you see an item you hash it to its slot in each bucket and then put it into the least loaded bucket. (Ties are broken by putting it into the leftmost least-loaded bucket.) Through this approach, with high probability the max load is $\frac{\ln \ln(n)}{d \ln \phi_d} + O(1)$ where $\phi_d$ is a sequence of numbers that are in the range of $[1.61, 2]$. This does better than the power of $d$ choices, since it divides by $d$ instead of $\ln(d)$.
\section{An Aside on Upcoming Lectures}
Next lecture we'll talk about spectral graph theory. This idea comes from adjacency theory, where you are able to study properties of the graph from spectral properties of the matrix. For example, given the eigenvalues and eigenvectors of the adjacency matrix of the graph, you can know how many connected components are in the graph.
We'll later talk about linear programming and how to solve linear programs, looking at the details of the Simplex method and Interior-Point polynomial time algorithms.
\section{Online Algorithms}
Online algorithms are about decision-making in the face of uncertainty about the future. That is, without knowing the future, the goal is to make the best decision (or close to the best decision) ``on-the-fly.'' We then compare our results with an omniscient being who knows the future.
\section{Pot of Gold}
Let's start with the \textbf{Pot of Gold} problem. Imagine a long hallway that contains equally-spaced treasure chests on both sides. The distance between all the treasure chests is one yard. We know that one of the treasure chests has gold inside, and the rest are empty. Each timestep, we walk to a new chest and open it to see if the gold is inside. The goal is to walk as few yards as possible to find the gold.
Let's say the gold is at some position $t$. We know that $OPT$ will pay $|t|$ yards, since it goes directly to where the gold is.
To compete with $OPT$, let's start by trying zig-zagging, going from $-1$ to $1$ then $-2$ to $2$. This leads to a lot of work taken to walk between the sides, so perhaps we should consider spending more time on each side and checking nearby chests.
One method to do this is going through powers of two, from $-1 \rightarrow 2 \rightarrow -4 \rightarrow 8 \rightarrow -16 \rightarrow 32$. The reason this works is because we need to walk to the origin regardless when crossing sides, so the time to go even further on the other side is amortized a similar amount.
Say $|t| \in [2^m, 2^{m+1}]$. We pay $2 \cdot (1 + 2 + 4 + \dots + 2^k) + |t|$, considering the time it takes to go back to the origin after each leg. In the worst case, $k = m+1$, since we got almost up to $t$ on one side, but went across and took about $2|t|$ additional time to get to $t$, so we have
$2 \cdot \underbrace{(1 + 2 + 4 + \dots + 2^{m+1})}_{\leq 4|t|} + |t| \leq 9|t|$.
\begin{definition} \label{3}
For any algorithm $A$, it is $C$-competitive if for all inputs $\sigma$,
\[\textrm{cost}(A(\sigma)) \leq C \cdot OPT(\sigma) + O(1).\]
\end{definition}
Thus, by \cref{3}, this algorithm is 9-competitive.
\section{Ski Rental Problem}
Imagine you and your friends are going to a ski resort, but haven't picked an end date for your vacation. Every day you decide whether to go home or continue skiing. The question is about whether you should rent skis each day, or buy the skis (which has a higher fixed cost, but will be helpful if you're staying for a long time).
In this scenario, renting skis costs $\$1$ per day, and buying skis is a one-time cost of $\$b$.
$OPT$, knowing that we stay for $d$ days, will buy on day 1 if $d \geq b$ and rent every day otherwise. Thus, $OPT = \min\{d, b\}$.
A good strategy for us would be to rent for the first $b-1$ days, then buy on day $b$. If $d < b$, we pay the same that $OPT$ does. If $d \geq b$, then we pay $(b-1) + b = 2b-1$ while $OPT$ pays $b$, so we're competitive with $OPT$ here (we are approximately 2-competitive since $\frac{2b-1}{b} \approx 2$).
\section{List Update Problem}
Let's explore another online algorithm problem, which is a warm-up for understanding paging and cache update problems. Here, the goal is to maintain a linked list of items with three kinds of operations:
\begin{itemize}
\item \texttt{access(x)}: Start at the beginning of a linked list, and follow list pointers until you get to $x$. The cost of this is the position of $x$.
\item \texttt{insert(x)}: Append $x$ to the end of the linked list, and pay the length of the list.
\item \texttt{delete(x)}: Walk to $x$, then remove it, and pay the cost of the position of $x$.
\end{itemize}
Note that at either $\texttt{insert}$ or $\texttt{access}$, after performing the operation you can choose to bring $x$ closer to the front by any number of positions for free.
The goal is to reduce the cost by deciding at each $\texttt{access}$ or $\texttt{insert}$ whether to bring $x$ to the front or not.
There are a few heuristics here that are natural:
\begin{itemize}
\item $\textbf{MF}$ or $\textbf{Move-to-Front}$. That is, each time you $\texttt{access}$ or $\texttt{insert}$ $x$, you move it all the way to the front.
\item $\textbf{Transpose}$. Here, each time you $\texttt{access}$ or $\texttt{insert}$ $x$, you bring $x$ one closer to the front.
\item $\textbf{FC}$ or $\textbf{Frequency Count}$, which tries to keep items in decreasing order of access frequency, based on past $\texttt{accesses}$. An item's count goes up when you $\texttt{access}$ it, so once accessed you know where to move it to maintain the sorted order.
\item $\textbf{SFC}$ which stands for $\textbf{Static Frequency Count}$. Here, you look into the future and keep items in decreasing order of the final frequency of their $\texttt{accesses}$. Note that this is the best static ordering. Since we can't look into the future, we can't run this algorithm, but we can use it to compare against.
\end{itemize}
The first paper \cite{BM85} to not make any assumptions about the frequencies of possible $x$s found that if items are initially sorted by time of first access, then for all sequences of operations $\sigma$, $\textrm{cost}(\textbf{MF}(\sigma)) \leq 2 \cdot \textrm{cost}(\textbf{SFC}(\sigma))$. This showed $\textbf{MF}$ is statically optimal.
What's more interesting is that $\textbf{MF}$ is not just statically optimal, but also dynamically optimal. Let's look at a model where transpositions cost $1$ when moving something other than $x$ toward front, and $0$ cost when moving $x$.
\begin{theorem}
$\forall A, \sigma$ where A is any algorithm making decisions (including $OPT$), and $\sigma$ is the sequence of operations,
$\textrm{cost}(\textbf{MF}(\sigma)) \leq 2 \cdot \textrm{cost}(A(\sigma)) + \underbrace{P(A(\sigma))}_{\textrm{paid transpositions cost}} - \underbrace{F(A(\sigma))}_{\textrm{\# of free moves}} - m$. \cite{ST85}
Note that this assumes $A$ and $\textbf{MF}$ start with the same ordering (possibly the empty list).
\end{theorem}
\begin{proof}
Let's use a potential function argument where $\Phi$(State) = the number of inversions in $\textbf{MF}$'s list, according to the ordering in $A$'s list.
As a reminder, the $\Phi$-cost of an operation = Total cost + $\Delta \Phi$, meaning Total $\Phi$-cost = Total cost + $\Phi$(final) - $\Phi$(initial).
Thus, the Total cost = Total $\Phi$-cost + $\Phi$(initial) - $\Phi$(final).
Since we start with the same number of inversions, $\Phi$(initial) $ = 0$ and $\Phi$(final) $\geq 0$ and this then means that Total cost $\leq$ Total $\Phi$-cost.
Now, let's look at the orderings of both $\textbf{MF}$ and $A$. Suppose we are accessing element $x$, which is in position $k$ in $\textbf{MF}$ and in position $i$ in $A$. There are some $t$ items that are before $x$ in $\textbf{MF}$ and after $x$ in $A$ (the red items in Figure 1). All other non-$x$ items in $\textbf{MF}$ which are before $x$ must also be before $x$ in $A$ (the blue items in Figure 1). Note that the number of elements in this set is $k - t - 1$, since there are $t$ elements that are before $x$ in $\textbf{MF}$ but after $x$ in $A$, $x$ is one element, and there are $k$ total elements up to and including $x$.
\begin{figure}[H]
\centering
\includegraphics[scale=0.25]{figure.png}
\caption{Comparison of lists using algorithm A and \textbf{MF}.}
\label{fig:my_label}
\end{figure}
Let's solve for the $\Phi$-cost here. We know that the actual cost of the operation $= k$. The $\Delta \Phi$ here considers the difference between $A$'s movements and $\textbf{MF}$ moving $x$ all the way to the front. When $\textbf{MF}$ moves $x$ all the way to the front, the $t$ items are now no longer inversions, and the $k - t - 1$ elements before $x$ in both $\textbf{MF}$ and $A$ now become inversions. Thus, $\Delta \Phi = (k-t-1) - t$. Thus, the $\Phi$-cost is $k + (k - t - 1) - t = 2(k-t) - 1$. We know that $k - t - 1 \leq i - 1$, since there are only $i-1$ items before $x$ in $A$'s list, so $k - t \leq i$. Then, the $\Phi$-cost is $\leq 2i - 1$. Then, summing this over all operations we get $2\cdot\textrm{cost}(A(\sigma)) - m$. And knowing that each paid transposition by $A$ increases $\Phi$ by at most $1$ while costing $A$ one unit, and each free move by $A$ (moving $x$ toward the front) can only decrease $\Phi$, we have completed the proof.
\end{proof}
\bibliographystyle{alpha}
\begin{thebibliography}{1}
\bibitem{ABKU99}
Yossi Azar, Andrei Z. Broder, Anna R. Karlin, Eli Upfal.
\newblock Balanced Allocations.
\newblock {\em SIAM J. Comput.}, 29(1):180--200, 1999.
\bibitem{Mitz01}
Michael Mitzenmacher, Andrea W. Richa, Ramesh Sitaraman.
\newblock Chapter 9: The Power of Two
Random Choices: A Survey Of Techniques And Results.
\newblock {\em Handbook of Randomized Computing.}
2001. Kluwer Academic Publishers.
\bibitem{V03}
Berthold V\"{o}cking.
\newblock How asymmetry helps load balancing.
\newblock {\em J. ACM}, 50(4):568--589, 2003.
\bibitem{BM85}
Jon L. Bentley, Catherine C. McGeoch.
\newblock Amortized Analyses of Self-organizing Sequential Search Heuristics.
\newblock {\em Commun. ACM}, 28(4):404--411, 1985.
\bibitem{ST85}
Daniel D. Sleator, Robert E. Tarjan.
\newblock Amortized Efficiency of List Update and Paging Rules.
\newblock {\em Commun. ACM}, 28(2):202--208, 1985.
\end{thebibliography}
\end{document}