diff options
author | Anthony Wang | 2022-02-21 19:20:38 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-21 19:20:38 -0600 |
commit | 1c1a518fc7114dc4496555b3d218099cad859707 (patch) | |
tree | d284f7554f99affbf079ab94f01a65714801be25 | |
parent | 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (diff) |
Make each line a status instead of just a big text file of words
-rw-r--r-- | data.py | 20 |
1 files changed, 9 insertions, 11 deletions
@@ -18,18 +18,16 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 
 
-# Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
-# Extract all words from statuses
-words = [word for message in text for word in message.split()]
-# Remove URLs and special characters and convert to lowercase
-words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('://') == -1]
-# Remove empty strings
-words = [word for word in words if word != '']
+for i in range(len(statuses)):
+    # Remove HTML stuff
+    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+    # Remove URLs and special characters and convert to lowercase
+    statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
+    # Remove empty strings
+    statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
 
 
 # Save to output file
 with open(args.output, 'w') as f:
-    for word in words:
-        print(word, file=f)
+    for status in statuses:
+        print(status, file=f)
|