diff options
author | Anthony Wang | 2022-02-22 16:57:12 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-22 16:57:12 -0600 |
commit | edd4708123a9a87669893797a61dd42401de21dd (patch) | |
tree | 842c9cc2db9716d464bb4261ef74d7903600c916 /data.py | |
parent | a4dc9c238bad031da5fe478267c6e87a6e3dffa4 (diff) |
Modify data generation script for transformers
Diffstat (limited to 'data.py')
-rw-r--r-- | data.py | 5 |
1 file changed, 3 insertions, 2 deletions
```diff
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
 # Clean up statuses
 for i in range(len(statuses)):
     # Remove HTML stuff
-    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
     # Remove URLs and special characters and convert to lowercase
-    statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+    # Uncomment for generating data for LSTMs
+    #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
 # Save to output file
```