diff options
author | Anthony Wang | 2022-02-21 19:43:02 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-21 19:43:02 -0600 |
commit | 289b8b4bcb99a5b47f9ea010f420de1eb06d47c3 (patch) | |
tree | bc51dd28936ec95a1ea4620f95bf1551b8470394 /data.py | |
parent | 1c1a518fc7114dc4496555b3d218099cad859707 (diff) |
Clean up statuses better in data.py
Diffstat (limited to 'data.py')
-rw-r--r-- | data.py | 5 |
1 files changed, 2 insertions, 3 deletions
```diff
@@ -18,13 +18,12 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 
 
+# Clean up statuses
 for i in range(len(statuses)):
     # Remove HTML stuff
     statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
     # Remove URLs and special characters and convert to lowercase
-    statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
-    # Remove empty strings
-    statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
+    statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
 
 
 # Save to output file
```