From 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 Mon Sep 17 00:00:00 2001 From: Anthony Wang Date: Mon, 21 Feb 2022 17:57:32 -0600 Subject: Only use English messages and clean up URLs better in data.py --- data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/data.py b/data.py index 87dbc91..5e90857 100644 --- a/data.py +++ b/data.py @@ -14,17 +14,17 @@ args = parser.parse_args() # Fetch messages from database since it's way faster than using the API conn = connect(args.database) cur = conn.cursor() -cur.execute('SELECT * FROM statuses') +cur.execute('SELECT text FROM statuses WHERE language=\'en\'') statuses = cur.fetchall() # Use regex to remove HTML stuff -text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses] +text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses] # Extract all words from statuses words = [word for message in text for word in message.split()] # Remove URLs and special characters and convert to lowercase words = [sub(r'[^a-z0-9]', '', word.lower()) - for word in words if word.find('.') == -1] + for word in words if word.find('://') == -1] # Remove empty strings words = [word for word in words if word != ''] @@ -32,4 +32,4 @@ words = [word for word in words if word != ''] # Save to output file with open(args.output, 'w') as f: for word in words: - f.write(word + '\n') + print(word, file=f) -- cgit v1.2.3-70-g09d2