diff options
-rw-r--r-- | data.py | 5 |
1 files changed, 3 insertions, 2 deletions
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
 # Clean up statuses
 for i in range(len(statuses)):
     # Remove HTML stuff
-    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
     # Remove URLs and special characters and convert to lowercase
-    statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+    # Uncomment for generating data for LSTMs
+    #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
 
 # Save to output file