about | summary | refs | log | tree | commit | diff
path: root/data.py
diff options
context:
space:
mode:
Diffstat (limited to 'data.py')
-rw-r--r-- data.py 5
1 files changed, 2 insertions, 3 deletions
diff --git a/data.py b/data.py
index 3eeeaf1..acfc64e 100644
--- a/data.py
+++ b/data.py
@@ -18,13 +18,12 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
statuses = cur.fetchall()
+# Clean up statuses
for i in range(len(statuses)):
# Remove HTML stuff
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
# Remove URLs and special characters and convert to lowercase
- statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
- # Remove empty strings
- statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
+ statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
# Save to output file