author     Anthony Wang  2024-06-13 17:57:31 -0500
committer  Anthony Wang  2024-06-13 17:57:31 -0500
commit     66432b3158c2182762572649ba0f051bf18dcaa6
tree       a9e3a59d2f47d1cb589d0323e636710d496a45a4 /static
parent     dbc640213dd7912bd3698bd90b568408b6bf84aa
First draft of gzip classification presentation
Diffstat (limited to 'static')
-rw-r--r--  static/src/gzip-classification.pdf  bin 0 -> 115204 bytes
-rw-r--r--  static/src/gzip-classification.tex  129
2 files changed, 129 insertions, 0 deletions
diff --git a/static/src/gzip-classification.pdf b/static/src/gzip-classification.pdf
new file mode 100644
index 0000000..8bf0d2c
--- /dev/null
+++ b/static/src/gzip-classification.pdf
Binary files differ
diff --git a/static/src/gzip-classification.tex b/static/src/gzip-classification.tex
new file mode 100644
index 0000000..ca743d9
--- /dev/null
+++ b/static/src/gzip-classification.tex
@@ -0,0 +1,129 @@
+\documentclass{beamer}
+
+\usepackage{pythonhighlight}
+
+% https://arxiv.org/abs/2212.09410
+% https://arxiv.org/abs/2309.10668
+% https://github.com/Futrell/ziplm
+% https://bellard.org/nncp/nncp_v2.1.pdf
+% https://prize.hutter1.net/ (blocked by AMD 🤷)
+% https://kenschutte.com/gzip-knn-paper/
+% https://kenschutte.com/gzip-knn-paper2/
+
+\title{Less is More: Parameter-Free Text Classification with Gzip}
+\author{Anthony Wang}
+
+\begin{document}
+
+\frame{\titlepage}
+
+\begin{frame}
+ \frametitle{Task: text classification}
+ \begin{itemize}
+ \item Given a set of categories, assign each string to one of them
+ \item Example from AG News dataset:
+ \begin{itemize}
+ \item Categories: World, Sports, Business, Sci/Tech
+ \item Example string: ``AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25.''
+ \end{itemize}
+ \item Usually solved with neural networks, e.g. fine-tuned BERT
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{gzip}
+ \begin{itemize}
+ \item Compression program using DEFLATE: LZ77 followed by Huffman coding (demo on the next slide)
+ \item Pretty fast and used everywhere
+ \item Doesn't require giant GPUs
+ \end{itemize}
+\end{frame}
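+
+\begin{frame}[fragile]
+ \frametitle{gzip from Python}
+ A quick demo (the input string is made up); repetition compresses well:
+ \begin{python}
+import gzip
+
+s = b"na" * 100  # 200 bytes of pure repetition
+# LZ77 turns the repeats into back-references,
+# so the output is much smaller than the input
+print(len(s), len(gzip.compress(s)))
+ \end{python}
+\end{frame}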
+
+\begin{frame}
+ \frametitle{Intuition}
+ \begin{itemize}
+ \item Compressors are good at capturing regularity
+ \item Objects from the same category share more regularity than objects from different categories
+ \item Let $C(x)$ be the compressed length of $x$, and suppose $x$ and $y$ are similar while $z$ is different
+ \item Then $C(xy) < C(xz)$, where $xy$ is the concatenation (quick check on the next slide)
+ \end{itemize}
+\end{frame}
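+
+\begin{frame}[fragile]
+ \frametitle{Checking the intuition}
+ A minimal check of $C(xy) < C(xz)$, with made-up example strings:
+ \begin{python}
+from gzip import compress
+
+def C(s):
+    # Compressed length of s in bytes
+    return len(compress(s.encode()))
+
+x = "the quick brown fox jumps over the lazy dog"
+y = "the quick brown fox leaps over the lazy dog"  # similar to x
+z = "pack my box with five dozen liquor jugs too"  # unrelated
+
+# The similar pair compresses to fewer bytes
+print(C(x + " " + y), C(x + " " + z))
+ \end{python}
+\end{frame}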
+
+\begin{frame}
+ \frametitle{Kolmogorov complexity}
+ \begin{definition}
+ \textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, i.e. an ideal compressor
+ \end{definition}
+ \begin{definition}
+ \textbf{Information distance} $E(x, y)$: length of the shortest program that converts $x$ into $y$ and $y$ into $x$, i.e. $\max\{K(x \mid y), K(y \mid x)\} \approx K(xy)-\min(K(x),K(y))$
+ \end{definition}
+ \begin{itemize}
+ \item $K(x)$ is \textbf{incomputable} though: computing it would let us solve the halting problem
+ \end{itemize}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Approximating $K$}
+ \begin{itemize}
+ \item Swap out $K$ for our gzip compressor: let $C(x)$ be the compressed length of $x$
+ \item Normalize by the larger compressed length, so strings of different sizes are comparable
+ \end{itemize}
+ \begin{definition}
+ \textbf{Normalized compression distance} \[NCD(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
+ \end{definition}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Method for classification}
+ \begin{itemize}
+ \item $k$-nearest-neighbor among all strings in the training set, using $NCD$ as the distance
+ \item That's all!
+ \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+ \frametitle{Code}
+ \begin{python}
+from gzip import compress as C
+import numpy as np
+
+k = 2  # number of nearest neighbors
+for (x1, _) in test_set:
+    # Compressed length of the test string
+    Cx1 = len(C(x1.encode()))
+    distance_from_x1 = []
+    for (x2, _) in training_set:
+        Cx2 = len(C(x2.encode()))
+        x1x2 = " ".join([x1, x2])
+        # Compressed length of the concatenation
+        Cx1x2 = len(C(x1x2.encode()))
+        ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
+        distance_from_x1.append(ncd)
+    sorted_idx = np.argsort(np.array(distance_from_x1))
+    # Majority vote over the k nearest training labels
+    top_k_class = [training_set[i][1] for i in sorted_idx[:k]]
+    predict_class = max(set(top_k_class), key=top_k_class.count)
+ \end{python}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Performance optimizations}
+ \begin{itemize}
+ \item Precompute $C(x)$ for all training strings and reuse the compressor state for $x$ when computing $C(xy)$ (sketch on the next slide)
+ \item Multithreading
+ \item Use C instead of Python
+ \end{itemize}
+\end{frame}
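+
+\begin{frame}[fragile]
+ \frametitle{Optimization sketch}
+ A rough sketch of the first two ideas, assuming \texttt{training\_set} as before; \texttt{zlib} shares gzip's DEFLATE core and lets us copy compressor state:
+ \begin{python}
+import zlib
+
+def C(s):
+    return len(zlib.compress(s.encode()))
+
+# Precompute compressed lengths of all training strings once
+train_lens = [C(x2) for (x2, _) in training_set]
+
+def distances_from(x1):
+    Cx1 = C(x1)
+    # Compress x1 once, then fork the compressor state per x2
+    base = zlib.compressobj()
+    shared = len(base.compress((x1 + " ").encode()))
+    dists = []
+    for (x2, _), Cx2 in zip(training_set, train_lens):
+        c = base.copy()  # reuse the work already done on x1
+        Cx1x2 = shared + len(c.compress(x2.encode())) + len(c.flush())
+        dists.append((Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2))
+    return dists
+
+# Parallelism: map distances_from over the test strings,
+# e.g. with multiprocessing.Pool
+ \end{python}
+\end{frame}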
+
+\begin{frame}
+ \frametitle{Results}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Actually\dots}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Connections between compression and ML}
+ \begin{itemize}
+ \item Autoencoders
+ \end{itemize}
+\end{frame}
+
+\end{document}