diff options
author | Anthony Wang | 2022-02-21 15:49:29 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-21 15:49:29 -0600 |
commit | 13c502ef53e8d16689730dc3ee97a05f78b2ef1e (patch) | |
tree | 25fce22a08a702bd0860efcefbe0dca14eefe60c /data.py | |
parent | 176f6f306bc5025f9c821491f139f94fd971050b (diff) |
Ignore URLs by ignoring strings containing '.'
Diffstat (limited to 'data.py')
-rw-r--r-- | data.py | 2 |
1 file changed, 1 insertion, 1 deletion
@@ -24,7 +24,7 @@ text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
 words = [word for message in text for word in message.split()]
 # Remove URLs and special characters and convert to lowercase
 words = [sub(r'[^a-z0-9]', '', word.lower())
-         for word in words if word.find('://') == -1]
+         for word in words if word.find('.') == -1]
 # Remove empty strings
 words = [word for word in words if word != '']