From 289b8b4bcb99a5b47f9ea010f420de1eb06d47c3 Mon Sep 17 00:00:00 2001
From: Anthony Wang
Date: Mon, 21 Feb 2022 19:43:02 -0600
Subject: Clean up statuses better in data.py

---
 data.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/data.py b/data.py
index 3eeeaf1..acfc64e 100644
--- a/data.py
+++ b/data.py
@@ -18,13 +18,12 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
 statuses = cur.fetchall()
 
 
+# Clean up statuses
 for i in range(len(statuses)):
     # Remove HTML stuff
     statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
     # Remove URLs and special characters and convert to lowercase
-    statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
-    # Remove empty strings
-    statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
+    statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
 
 
 # Save to output file
-- 
cgit v1.2.3-70-g09d2