diff options
author | Anthony Wang | 2024-06-13 17:57:31 -0500 |
---|---|---|
committer | Anthony Wang | 2024-06-13 17:57:31 -0500 |
commit | 66432b3158c2182762572649ba0f051bf18dcaa6 (patch) | |
tree | a9e3a59d2f47d1cb589d0323e636710d496a45a4 /static | |
parent | dbc640213dd7912bd3698bd90b568408b6bf84aa (diff) |
First draft of gzip classification presentation
Diffstat (limited to 'static')
-rw-r--r-- | static/src/gzip-classification.pdf | bin | 0 -> 115204 bytes | |||
-rw-r--r-- | static/src/gzip-classification.tex | 129 |
2 files changed, 129 insertions, 0 deletions
diff --git a/static/src/gzip-classification.pdf b/static/src/gzip-classification.pdf
Binary files differ
new file mode 100644
index 0000000..8bf0d2c
--- /dev/null
+++ b/static/src/gzip-classification.pdf
diff --git a/static/src/gzip-classification.tex b/static/src/gzip-classification.tex
new file mode 100644
index 0000000..ca743d9
--- /dev/null
+++ b/static/src/gzip-classification.tex
@@ -0,0 +1,129 @@
% Beamer slides: gzip-based parameter-free text classification
% (after Jiang et al., "Less is More" / the gzip-kNN paper).
\documentclass{beamer}

\usepackage{pythonhighlight}

% Background reading:
% https://arxiv.org/abs/2212.09410
% https://arxiv.org/abs/2309.10668
% https://github.com/Futrell/ziplm
% https://bellard.org/nncp/nncp_v2.1.pdf
% https://prize.hutter1.net/ (blocked by AMD 🤷)
% https://kenschutte.com/gzip-knn-paper/
% https://kenschutte.com/gzip-knn-paper2/

% Semantic macro for the normalized compression distance: multi-letter
% identifiers must be upright (\mathrm), not italic ($NCD$ would typeset
% as the product N·C·D); defining it once keeps notation consistent.
\newcommand*{\NCD}{\mathrm{NCD}}

\title{Less is More: Parameter-Free Text Classification with Gzip}
\author{Anthony Wang}

\begin{document}

\frame{\titlepage}

\begin{frame}
  % Title fixed: the slide describes the *classification* task, not compression.
  \frametitle{Task: text classification}
  \begin{itemize}
    \item Given some categories, classify strings into the categories
    \item Example from AG News dataset:
    \begin{itemize}
      \item Categories: World, Sports, Business, Sci/Tech
      % The stray "quot;" fragments below are artifacts present in the AG News
      % data itself and are reproduced verbatim on purpose.
      \item Example string: ``AMD Will Have An Edge Over Intel Through 2005. Piper Jaffray said Advanced Micro Devices (nyse: AMD - news - people ) should have an edge over Intel (nasdaq: INTC - news - people ) quot;throughout 2005 quot; but resumed coverage of both companies at quot;market perform quot; and both at price targets of \$25.''
    \end{itemize}
    % "e.g." (for example), not "i.e.": BERT is one instance of such a network.
    \item Usually solved using neural networks, e.g.\ BERT
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{gzip}
  \begin{itemize}
    \item Compression program using LZ77 and Huffman coding
    \item Pretty fast and used everywhere
    \item Doesn't require giant GPUs
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Intuition}
  \begin{itemize}
    \item Compressors are good at capturing regularity
    \item Objects from the same category share more regularity than objects from different categories
    \item Compressor $C$, strings $x,y$ similar, different from $z$
    \item Then $C(xy) < C(xz)$
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Kolmogorov complexity}
  \begin{definition}
    \textbf{Kolmogorov complexity} $K(x)$: length of shortest program that outputs $x$, i.e. an ideal compressor
  \end{definition}
  \begin{definition}
    \textbf{Information distance} $E(x, y)$: length of shortest program that converts $x$ to $y$, equal to $K(xy)-K(x)$
  \end{definition}
  \begin{itemize}
    \item $K(x)$ is generally \textbf{incomputable} though, similar to the halting problem
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Approximating $K$}
  \begin{itemize}
    \item Simply swap out $K$ for our gzip compressor $C$
    \item Normalize by length
  \end{itemize}
  \begin{definition}
    % C(xy) = compressed length of the *concatenation*, matching the
    % intuition slide and the code below (which compresses x1x2 joined).
    \textbf{Normalized compression distance} \[\NCD(x,y) = \frac{C(xy)-\min(C(x),C(y))}{\max(C(x),C(y))}\]
  \end{definition}
\end{frame}

\begin{frame}
  \frametitle{Method for classification}
  \begin{itemize}
    \item $k$-nearest-neighbor among all strings in the training set, using $\NCD$ as the distance
    \item That's all!
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Code}
  \begin{python}
from gzip import compress as C
import numpy as np

for (x1, _) in test_set:
    Cx1 = len(C(x1.encode()))
    distance_from_x1 = []
    for (x2, _) in training_set:
        Cx2 = len(C(x2.encode()))
        x1x2 = " ".join([x1, x2])
        Cx1x2 = len(C(x1x2.encode()))
        ncd = (Cx1x2 - min(Cx1,Cx2)) / max(Cx1, Cx2)
        distance_from_x1.append(ncd)
    sorted_idx = np.argsort(np.array(distance_from_x1))
    top_k_class = training_set[sorted_idx[:k], 1]
    predict_class = max(set(top_k_class), key=top_k_class.count)
  \end{python}
\end{frame}

\begin{frame}
  \frametitle{Performance optimizations}
  \begin{itemize}
    \item Can reuse $C(x)$ when computing $C(xy)$
    \item Multithreading
    \item Use C instead of Python
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{Results}
\end{frame}

\begin{frame}
  \frametitle{Actually\dots}
\end{frame}

\begin{frame}
  \frametitle{Connections between compression and ML}
  \begin{itemize}
    \item Autoencoders
  \end{itemize}
\end{frame}

\end{document}