Only use English messages and clean up URLs better in data.py

author: Anthony Wang 2022-02-21 17:57:32 -0600
committer: Anthony Wang 2022-02-21 17:57:32 -0600
commit: 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (patch)
tree: c5fc13933a22f116504b4003dc01462b0cf89668
parent: 5f9292e242121a0c9c18a4bd32c811a313228a8b (diff)
1 file changed, 4 insertions, 4 deletions
diff --git a/data.py b/data.py
index 87dbc91..5e90857 100644
--- a/data.py
+++ b/data.py
@@ -14,17 +14,17 @@ args = parser.parse_args()
 # Fetch messages from database since it's way faster than using the API
 conn = connect(args.database)
 cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
+cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 
 
 # Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
 # Extract all words from statuses
 words = [word for message in text for word in message.split()]
 # Remove URLs and special characters and convert to lowercase
 words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('.') == -1]
+         for word in words if word.find('://') == -1]
 # Remove empty strings
 words = [word for word in words if word != '']
 
@@ -32,4 +32,4 @@ words = [word for word in words if word != '']
 # Save to output file
 with open(args.output, 'w') as f:
     for word in words:
-        f.write(word + '\n')
+        print(word, file=f)
author	Anthony Wang	2022-02-21 17:57:32 -0600
committer	Anthony Wang	2022-02-21 17:57:32 -0600
commit	10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (patch)
tree	c5fc13933a22f116504b4003dc01462b0cf89668
parent	5f9292e242121a0c9c18a4bd32c811a313228a8b (diff)