about | summary | refs | log | tree | commit | diff
path: root/data.py
diff options
context:
space:
mode:
author: Anthony Wang, 2022-02-22 16:57:12 -0600
committer: Anthony Wang, 2022-02-22 16:57:12 -0600
commit: edd4708123a9a87669893797a61dd42401de21dd (patch)
tree: 842c9cc2db9716d464bb4261ef74d7903600c916 /data.py
parent: a4dc9c238bad031da5fe478267c6e87a6e3dffa4 (diff)
Modify data generation script for transformers
Diffstat (limited to 'data.py')
-rw-r--r-- data.py 5
1 file changed, 3 insertions, 2 deletions
diff --git a/data.py b/data.py
index acfc64e..2ff8a59 100644
--- a/data.py
+++ b/data.py
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
# Clean up statuses
for i in range(len(statuses)):
# Remove HTML stuff
- statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+ statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
# Remove URLs and special characters and convert to lowercase
- statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+ # Uncomment for generating data for LSTMs
+ #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
# Save to output file