From edd4708123a9a87669893797a61dd42401de21dd Mon Sep 17 00:00:00 2001
From: Anthony Wang
Date: Tue, 22 Feb 2022 16:57:12 -0600
Subject: Modify data generation script for transformers

---
 data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/data.py b/data.py
index acfc64e..2ff8a59 100644
--- a/data.py
+++ b/data.py
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
 # Clean up statuses
 for i in range(len(statuses)):
     # Remove HTML stuff
-    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+    statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
     # Remove URLs and special characters and convert to lowercase
-    statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+    # Uncomment for generating data for LSTMs
+    #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
 
 
 # Save to output file
-- 
cgit v1.2.3-70-g09d2