From 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 Mon Sep 17 00:00:00 2001
From: Anthony Wang
Date: Mon, 21 Feb 2022 17:57:32 -0600
Subject: Only use English messages and clean up URLs better in data.py

---
 data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data.py b/data.py
index 87dbc91..5e90857 100644
--- a/data.py
+++ b/data.py
@@ -14,17 +14,17 @@ args = parser.parse_args()
 # Fetch messages from database since it's way faster than using the API
 conn = connect(args.database)
 cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
+cur.execute("SELECT text FROM statuses WHERE language='en'")
 statuses = cur.fetchall()
 
 
 # Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+text = [unescape(sub(r'<[^>]*>', '', sub(r'</p>|<br\s*/?>', ' ', status[0]))) for status in statuses]  # </p> and <br> become spaces so adjacent words are not glued together
 # Extract all words from statuses
 words = [word for message in text for word in message.split()]
 # Remove URLs and special characters and convert to lowercase
 words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('.') == -1]
+         for word in words if '://' not in word]
 # Remove empty strings
 words = [word for word in words if word != '']
 
@@ -32,4 +32,4 @@ words = [word for word in words if word != '']
 # Save to output file
 with open(args.output, 'w') as f:
     for word in words:
-        f.write(word + '\n')
+        print(word, file=f)
-- 
cgit v1.2.3-70-g09d2