about | summary | refs | log | tree | commit | diff
path: root/data.py
diff options
context:
space:
mode:
Diffstat (limited to 'data.py')
-rw-r--r-- data.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/data.py b/data.py
index acfc64e..2ff8a59 100644
--- a/data.py
+++ b/data.py
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
# Clean up statuses
for i in range(len(statuses)):
# Remove HTML stuff
- statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+ statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
# Remove URLs and special characters and convert to lowercase
- statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+ # Uncomment for generating data for LSTMs
+ #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
# Save to output file