From 55f2c364c6d9b9d2d9549a55deac7e1416eabc02 Mon Sep 17 00:00:00 2001 From: SIPB Date: Sun, 24 Nov 2024 19:48:51 -0500 Subject: Add first draft of blog post --- blog.bib | 51 +++++++++ blog.md | 73 ++++++++++++ index.html | 381 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 505 insertions(+) create mode 100644 blog.bib create mode 100644 blog.md create mode 100644 index.html diff --git a/blog.bib b/blog.bib new file mode 100644 index 0000000..b118a5a --- /dev/null +++ b/blog.bib @@ -0,0 +1,51 @@ +@inproceedings{10.5555/3666122.3666260, +author = {Zang, Xiao and Yin, Miao and Xiao, Jinqi and Zonouz, Saman and Yuan, Bo}, +title = {GraphMP: graph neural network-based motion planning with efficient graph search}, +year = {2024}, +publisher = {Curran Associates Inc.}, +address = {Red Hook, NY, USA}, +abstract = {Motion planning, which aims to find a high-quality collision-free path in the configuration space, is a fundamental task in robotic systems. Recently, learning-based motion planners, especially the graph neural network-powered, have shown promising planning performance. However, though the state-of-the-art GNN planner can efficiently extract and learn graph information, its inherent mechanism is not well suited for graph search process, hindering its further performance improvement. To address this challenge and fully unleash the potential of GNN in motion planning, this paper proposes GraphMP, a neural motion planner for both low and high-dimensional planning tasks. With the customized model architecture and training mechanism design, GraphMP can simultaneously perform efficient graph pattern extraction and graph search processing, leading to strong planning performance. 
Experiments on a variety of environments, ranging from 2D Maze to 14D dual KUKA robotic arm, show that our proposed GraphMP achieves significant improvement on path quality and planning speed over state-of-the-art learning-based and classical planners; while preserving competitive success rate.}, +booktitle = {Proceedings of the 37th International Conference on Neural Information Processing Systems}, +articleno = {138}, +numpages = {12}, +location = {New Orleans, LA, USA}, +series = {NIPS '23} +} + +@article{DBLP:journals/corr/abs-2102-09544, + author = {Quentin Cappart and + Didier Ch{\'{e}}telat and + Elias B. Khalil and + Andrea Lodi and + Christopher Morris and + Petar Velickovic}, + title = {Combinatorial optimization and reasoning with graph neural networks}, + journal = {CoRR}, + volume = {abs/2102.09544}, + year = {2021}, + url = {https://arxiv.org/abs/2102.09544}, + eprinttype = {arXiv}, + eprint = {2102.09544}, + timestamp = {Fri, 26 Feb 2021 14:31:25 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2102-09544.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{10.1109/TPAMI.2023.3256421, +author = {Tutsoy, Onder}, +title = {Graph Theory Based Large-Scale Machine Learning With Multi-Dimensional Constrained Optimization Approaches for Exact Epidemiological Modeling of Pandemic Diseases}, +year = {2023}, +issue_date = {Aug. 2023}, +publisher = {IEEE Computer Society}, +address = {USA}, +volume = {45}, +number = {8}, +issn = {0162-8828}, +url = {https://doi.org/10.1109/TPAMI.2023.3256421}, +doi = {10.1109/TPAMI.2023.3256421}, +abstract = {Multi-dimensional prediction models of the pandemic diseases should be constructed in a way to reflect their peculiar epidemiological characters. In this paper, a graph theory-based constrained multi-dimensional (CM) mathematical and meta-heuristic algorithms (MA) are formed to learn the unknown parameters of a large-scale epidemiological model. 
The specified parameter signs and the coupling parameters of the sub-models constitute the constraints of the optimization problem. In addition, magnitude constraints on the unknown parameters are imposed to proportionally weight the input-output data importance. To learn these parameters, a gradient-based CM recursive least square (CM-RLS) algorithm, and three search-based MAs; namely, the CM particle swarm optimization (CM-PSO), the CM success history-based adaptive differential evolution (CM-SHADE), and the CM-SHADEWO enriched with the whale optimization (WO) algorithms are constructed. The traditional SHADE algorithm was the winner of the 2018 IEEE congress on evolutionary computation (CEC) and its versions in this paper are modified to create more certain parameter search spaces. The results obtained under the equal conditions show that the mathematical optimization algorithm CM-RLS outperforms the MA algorithms, which is expected since it uses the available gradient information. However, the search-based CM-SHADEWO algorithm is able to capture the dominant character of the CM optimization solution and produce satisfactory estimates in the presence of the hard constraints, uncertainties and lack of gradient information.}, +journal = {IEEE Trans. Pattern Anal. Mach. Intell.}, +month = aug, +pages = {9836–9845}, +numpages = {10} +} \ No newline at end of file diff --git a/blog.md b/blog.md new file mode 100644 index 0000000..68de189 --- /dev/null +++ b/blog.md @@ -0,0 +1,73 @@ +--- +build: pandoc blog.md --citeproc -s -o index.html +title: "6.7960 Project: Investigating Off-Distribution Generalization of Transformers" +bibliography: blog.bib +link-citations: true +--- + + + + + +
+Anthony Wang, Alek Westover, Kevin Zhao + +{xy,alekw,kevinmz}\@mit.edu +
+ +## Abstract + +TODO: Probably should move this to the introduction instead of abstract? + +Recently, LLMs have been developing very fast, and with that comes the concern of aligning the models to output true and productive statements. One common approach for ensuring this is to have a human in the loop rewarding the model for true outputs (e.g. RLHF), but one drawback to this problem is that humans can be poor judges of truthfulness. As LLMs become more capable, there might not even exist experts that are good judges of whether the model's outputs, such as difficult mathematical proofs, are truthful. So, we'd like to propose a potential solution to this issue via **off-distribution generalization** - applying human-like intuition to solve problems not in the dataset. To do so involves Graph Neural Networks (GNNs), which are important in many artificial intelligence applications, including the design of practical features. Paul Christiano [proposed an experiment](https://www.alignmentforum.org/posts/BxersHYN2qcFoonwg/experimentally-evaluating-whether-honesty-generalizes?commentId=dsDA2BWpHPdgLvaXX) about shortest paths in a graph; our project is essentially to implement Christiano's proposed experiment. To the best of our knowledge, although there has been research in applying machine learning for different variations of graph searches [@10.5555/3666122.3666260], no one has done our exact experiment yet. + +# Introduction + +## Overview + +## Related Work + +There has been some research into the algorithmic optimization of GNNs and how they may solve real-world issues; however, none of the related work targets using generic machine learning methods to solve graph problems. + +- Cappart et al. has researched more into the Combinatorial Optimization of GNNs and developed algorithms for related tasks, thus facilitating machine learning [@DBLP:journals/corr/abs-2102-09544]. 
Their results are mostly algorithmic so we develop further by trading a bit of accuracy for much faster computation in such tasks. + +- Tutsoy uses a graph-theory-based approach to model the epidemiological characteristics of infectious diseases, such as COVID-19 [@10.1109/TPAMI.2023.3256421]. We understand from his paper how GNN optimization may also be useful in researching novel diseases. + +## Sectioning + +# Our Machine Learning Approach + +## Task + +We will use a synthetic task to test our hypothesis that models will generalize truthfully off-distribution. The synthetic task is computing the distance between various vertices in an input graph. Our experiment will have three parts: + +1. Pre-train a transformer to predict the distance between two fixed vertices $s,t$ on graphs with $n\in [8, 32)$ vertices. +2. Fine-tune a transformer to predict the distances between $s,t'$ for any $t'$ which is on the shortest path from $s$ to $t$, but only do fine-tuning on graphs with $n\in [8,16)$ vertices. +3. Test whether the transformer can accurately predict the distances between $s,t'$ for any $t'$ on the shortest path from $s$ to $t$ for graphs with $n\in [16,32)$ vertices. + +## Data + +We will represent an $n$ vertex, $m$ edge unweighted, undirected graph as a sequence of the endpoints of the $m$ edges, so $[a_1,b_1,a_2,b_2,\ldots,a_m,b_m]$ represents a graph with the edges $\{(a_i,b_i)\}$ for $1 \leq i \leq m$. We will pad all sequences to be the same length using the padding token 0. + +The full input to our model will additionally add the target vertex after the padding tokens. The model is tasked with predicting the length of the shortest path between vertex 1 and the target vertex $t$. If no such path exists, we define the length to be $n+1$ which represents infinity. For example, an input-output pair for our model could look like $[1, 3, 3, 2, 0, 0, 0, 0, 2]$ and $2$ respectively. + +We have three separate datasets. 
+ +- **Pre-train data**: For each $n \in [8,32)$, we will generate several graphs on $n$ vertices. We generate these graphs by inserting $2n$ random edges into the graph. We always set the target vertex to be $2$ here. +- **Fine-tune data**: For each $n \in [8,16)$, we will generate several graphs on $n$ vertices. We generate these graphs by inserting $2n$ random edges into the graph. We select the target vertex to be a random vertex on the shortest path from $1$ to $2$. +- **Generalization testing data**: The same as the fine-tune data, except we sample $n \in [16,32)$ instead. + +As a side note, we are also curious whether the transformer learns to generalize to different distributions of graphs, such as denser graphs or graphs with different properties. Time permitting, we will also investigate this. + +## Architecture + +We plan to use a standard transformer architecture. We will ensure that the number of layers in our transformer is at least the diameter of the graph. By doing this, we ensure that there is an extremely simple circuit --- namely BFS --- that the transformer could in theory learn to perform the task. Note that if the transformer actually learns a simple circuit to perform this task, then it seems more likely to generalize well. This is also our intuition for why it should be possible to fine tune on a small amount of data for finding shortest paths to other vertices besides $2$ -- it seems like the model should be computing these other distances as intermediate values in its computation to find the distance to vertex $2$. + +## Positional Encodings + +In order to facilitate performing this task with limited computational resources, we plan to use custom-made positional encodings that tell the model extra information about the +structure of the problem, rather than the traditional sine/cosine positional encodings. 
(TODO: THIS IS OUTDATED) Specifically, our positional encodings are $v_1,v_1,v_2,v_2,\ldots,v_m,v_m,v_{m+1}$ where each $v_i$ is a random vector so each $v_i,v_j$ pair is nearly orthogonal with high probability. We will concatenate these with the token encodings rather than adding them. This should let the model easily have large attention scores between vertices corresponding to a single edge. + +## References diff --git a/index.html b/index.html new file mode 100644 index 0000000..0126a3b --- /dev/null +++ b/index.html @@ -0,0 +1,381 @@ + + + + + + + 6.7960 Project: Investigating Off-Distribution Generalization of Transformers + + + +
+

6.7960 Project: Investigating Off-Distribution +Generalization of Transformers

+
+ + +
+

Anthony Wang, Alek Westover, Kevin Zhao

+

{xy,alekw,kevinmz}@mit.edu

+
+

Abstract

+

TODO: Probably should move this to the introduction instead of +abstract?

+

Recently, LLMs have been developing very fast, and with that comes +the concern of aligning the models to output true and productive +statements. One common approach for ensuring this is to have a human in +the loop rewarding the model for true outputs (e.g. RLHF), but one +drawback to this problem is that humans can be poor judges of +truthfulness. As LLMs become more capable, there might not even exist +experts that are good judges of whether the model’s outputs, such as +difficult mathematical proofs, are truthful. So, we’d like to propose a +potential solution to this issue via off-distribution +generalization - applying human-like intuition to solve +problems not in the dataset. To do so involves Graph Neural Networks +(GNNs), which are important in many artificial intelligence +applications, including the design of practical features. Paul +Christiano proposed +an experiment about shortest paths in a graph; our project is +essentially to implement Christiano’s proposed experiment. To the best +of our knowledge, although there has been research in applying machine +learning for different variations of graph searches (Zang et al. +2024), no one has done our exact experiment yet.

+

Introduction

+

Overview

+ +

There has been some research into the algorithmic optimization of +GNNs and how they may solve real-world issues; however, none of the +related work targets using generic machine learning methods to solve +graph problems.

+ +

Sectioning

+

Our Machine Learning +Approach

+

Task

+

We will use a synthetic task to test our hypothesis that models will +generalize truthfully off-distribution. The synthetic task is computing +the distance between various vertices in an input graph. Our experiment +will have three parts:

+
    +
  1. Pre-train a transformer to predict the distance between two fixed +vertices s, t on +graphs with n ∈ [8, 32) +vertices.
  2. +
  3. Fine-tune a transformer to predict the distances between s, t for any t which is on the shortest path +from s to t, but only do fine-tuning on graphs +with n ∈ [8, 16) +vertices.
  4. +
  5. Test whether the transformer can accurately predict the distances +between s, t for any +t on the shortest path from +s to t for graphs with n ∈ [16, 32) vertices.
  6. +
+

Data

+

We will represent an n +vertex, m edge unweighted, +undirected graph as sequence of the endpoints of the m edges, so [a1, b1, a2, b2, …, am, bm] +represents a graph with the edges {(ai, bi)} +for 1 ≤ i ≤ m. We +will pad all sequences to be the same length using the padding token +0.

+

The full input to our model will additionally add the target vertex +after the padding tokens. The model is tasked with predicting the length +of the shortest path between vertex 1 and the target vertex t. If no such path exists, we define +the length to be n + 1 which +represents infinity. For example, an input-output pair for our model +could look like [1, 3, 3, 2, 0, 0, 0, 0, 2] and 2 respectively.

+

We have three separate datasets.

+ +

As a side note, we are also curious whether the transformer learns to +generalize to different distributions of graphs, such as denser graphs +or graphs with different properties. Time permitting, we will also +investigate this.

+

Architecture

+

We plan to use a standard transformer architecture. We will ensure +that the number of layers in our transformer is at least the diameter of +the graph. By doing this, we ensure that there is an extremely simple +circuit — namely BFS — that the transformer could in theory learn to +perform the task. Note that if the transformer actually learns a simple +circuit to perform this task, then it seems more likely to generalize +well. This is also our intuition for why it should be possible to fine +tune on a small amount of data for finding shortest paths to other +vertices besides 2 – it seems like the +model should be computing these other distances as intermediate values +in its computation to find the distance to vertex 2.

+

Positional Encodings

+

In order to facilitate performing this task with limited +computational resources, we plan to use custom-made positional encodings +that tell the model extra information about the structure of the +problem, rather than the traditional sine/cosine positional encodings. +(TODO: THIS IS OUTDATED) Specifically, our positional encodings are +v1, v1, v2, v2, …, vm, vm, vm + 1 +where each vi is a random +vector so each vi, vj +pair is nearly orthogonal with high probability. We will concatenate +these with the token encodings rather than adding them. This should let +the model easily have large attention scores between vertices +corresponding to a single edge.

+

References

+
+
+Cappart, Quentin, Didier Chételat, Elias B. Khalil, Andrea Lodi, +Christopher Morris, and Petar Velickovic. 2021. “Combinatorial +Optimization and Reasoning with Graph Neural Networks.” +CoRR abs/2102.09544. https://arxiv.org/abs/2102.09544. +
+
+Tutsoy, Onder. 2023. “Graph Theory Based Large-Scale Machine +Learning with Multi-Dimensional Constrained Optimization Approaches for +Exact Epidemiological Modeling of Pandemic Diseases.” IEEE +Trans. Pattern Anal. Mach. Intell. 45 (8): 9836–45. https://doi.org/10.1109/TPAMI.2023.3256421. +
+
+Zang, Xiao, Miao Yin, Jinqi Xiao, Saman Zonouz, and Bo Yuan. 2024. +“GraphMP: Graph Neural Network-Based Motion Planning with +Efficient Graph Search.” In Proceedings of the 37th +International Conference on Neural Information Processing Systems. +NIPS ’23. Red Hook, NY, USA: Curran Associates Inc. +
+
+ + -- cgit v1.2.3-70-g09d2