about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Anthony Wang, 2022-02-21 17:57:32 -0600
committer: Anthony Wang, 2022-02-21 17:57:32 -0600
commit: 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (patch)
tree: c5fc13933a22f116504b4003dc01462b0cf89668
parent: 5f9292e242121a0c9c18a4bd32c811a313228a8b (diff)
Only use English messages and clean up URLs better in data.py
-rw-r--r-- data.py | 8
1 files changed, 4 insertions, 4 deletions
diff --git a/data.py b/data.py
index 87dbc91..5e90857 100644
--- a/data.py
+++ b/data.py
@@ -14,17 +14,17 @@ args = parser.parse_args()
# Fetch messages from database since it's way faster than using the API
conn = connect(args.database)
cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
+cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
statuses = cur.fetchall()
# Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
# Extract all words from statuses
words = [word for message in text for word in message.split()]
# Remove URLs and special characters and convert to lowercase
words = [sub(r'[^a-z0-9]', '', word.lower())
- for word in words if word.find('.') == -1]
+ for word in words if word.find('://') == -1]
# Remove empty strings
words = [word for word in words if word != '']
@@ -32,4 +32,4 @@ words = [word for word in words if word != '']
# Save to output file
with open(args.output, 'w') as f:
for word in words:
- f.write(word + '\n')
+ print(word, file=f)