Write one cleaned status per output line instead of one word per line

author: Anthony Wang 2022-02-21 19:20:38 -0600
committer: Anthony Wang 2022-02-21 19:20:38 -0600
commit: 1c1a518fc7114dc4496555b3d218099cad859707 (patch)
tree: d284f7554f99affbf079ab94f01a65714801be25 /data.py
parent: 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (diff)
1 files changed, 9 insertions, 11 deletions
diff --git a/data.py b/data.py
index 5e90857..3eeeaf1 100644
--- a/data.py
+++ b/data.py
@@ -18,18 +18,16 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 
 
-# Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
-# Extract all words from statuses
-words = [word for message in text for word in message.split()]
-# Remove URLs and special characters and convert to lowercase
-words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('://') == -1]
-# Remove empty strings
-words = [word for word in words if word != '']
+for i in range(len(statuses)):
+    # Remove HTML stuff
+    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+    # Remove URLs and special characters and convert to lowercase
+    statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
+    # Remove empty strings
+    statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
 
 
 # Save to output file
 with open(args.output, 'w') as f:
-    for word in words:
-        print(word, file=f)
+    for status in statuses:
+        print(status, file=f)
author	Anthony Wang	2022-02-21 19:20:38 -0600
committer	Anthony Wang	2022-02-21 19:20:38 -0600
commit	1c1a518fc7114dc4496555b3d218099cad859707 (patch)
tree	d284f7554f99affbf079ab94f01a65714801be25 /data.py
parent	10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (diff)