diff options
author | Anthony Wang | 2022-02-21 17:57:32 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-21 17:57:32 -0600 |
commit | 10c21fb1cd1081c94d4c7fd024b05a7847206aa9 (patch) | |
tree | c5fc13933a22f116504b4003dc01462b0cf89668 | |
parent | 5f9292e242121a0c9c18a4bd32c811a313228a8b (diff) |
Only use English messages and clean up URLs better in data.py
-rw-r--r-- | data.py | 8 |
1 files changed, 4 insertions, 4 deletions
@@ -14,17 +14,17 @@ args = parser.parse_args()
 # Fetch messages from database since it's way faster than using the API
 conn = connect(args.database)
 cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
+cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 # Use regex to remove HTML stuff
-text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
 # Extract all words from statuses
 words = [word for message in text for word in message.split()]
 # Remove URLs and special characters and convert to lowercase
 words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('.') == -1]
+         for word in words if word.find('://') == -1]
 # Remove empty strings
 words = [word for word in words if word != '']
@@ -32,4 +32,4 @@ words = [word for word in words if word != '']
 # Save to output file
 with open(args.output, 'w') as f:
     for word in words:
-        f.write(word + '\n')
+        print(word, file=f)