Clean up data.py script for generating training data

author: Anthony Wang 2022-02-21 12:54:35 -0600
committer: Anthony Wang 2022-02-21 12:54:35 -0600
commit: 01462b7073dda66f97210c98b394c65febd15156 (patch)
tree: 90987c38fd8b6ecf842273e5dab502872ea0abb7
parent: 198b2af9c77544ef2d7b4dfe290de66b8b515758 (diff)
2 files changed, 33 insertions, 29 deletions
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..4090384
--- /dev/null
+++ b/data.py
@@ -0,0 +1,33 @@
+from re import sub
+from html import unescape
+from argparse import ArgumentParser
+
+from psycopg2 import connect
+
+
+parser = ArgumentParser()
+parser.add_argument('-d', '--database', help='database connection string')
+parser.add_argument('-o', '--output', help='Output file', default='data')
+args = parser.parse_args()
+
+
+# Fetch messages from database since it's way faster than using the API
+conn = connect(args.database)
+cur = conn.cursor()
+cur.execute('SELECT * FROM statuses')
+statuses = cur.fetchall()
+
+
+# Use regex to replace HTML tags with spaces, then unescape HTML entities
+text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+# Extract all words from statuses
+words = [word for message in text for word in message.split()]
+# Remove URLs and special characters and convert to lowercase
+words = [sub(r'[^a-z0-9]', '', word.lower()) for word in words if word.find('://') == -1]
+# Remove empty strings
+words = [word for word in words if word != '']
+
+
+with open(args.output, 'w') as f:
+    for word in words:
+        f.write(word + '\n')
diff --git a/db.py b/db.py
deleted file mode 100644
index ee24af0..0000000
--- a/db.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import psycopg2
-
-
-# Fetch messages from database since it's way faster than using the API
-conn = psycopg2.connect(dbname="mastodon_production")
-cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
-statuses = cur.fetchall()
-
-
-# Extract all words from statuses
-# Use regex to remove HTML stuff
-text = [re.sub(r'<[^>]*>', ' ', status[2]) for status in statuses]
-# print(text[0:100])
-
-
-words = [word for message in text for word in message.split()]
-# Remove URLs and special characters and convert to lowercase
-words = [re.sub(r'[^a-z0-9]', '', word.lower()) for word in words if word.find('://') == -1]
-# Remove empty strings
-words = [word for word in words if word != '']
-
-
-with open('/tmp/text', 'w') as f:
-    for word in words:
-        f.write(word + '\n')
author	Anthony Wang	2022-02-21 12:54:35 -0600
committer	Anthony Wang	2022-02-21 12:54:35 -0600
commit	01462b7073dda66f97210c98b394c65febd15156 (patch)
tree	90987c38fd8b6ecf842273e5dab502872ea0abb7
parent	198b2af9c77544ef2d7b4dfe290de66b8b515758 (diff)