diff options
author | Anthony Wang | 2022-02-21 12:54:35 -0600 |
---|---|---|
committer | Anthony Wang | 2022-02-21 12:54:35 -0600 |
commit | 01462b7073dda66f97210c98b394c65febd15156 (patch) | |
tree | 90987c38fd8b6ecf842273e5dab502872ea0abb7 | |
parent | 198b2af9c77544ef2d7b4dfe290de66b8b515758 (diff) |
Clean up data.py script for generating training data
-rw-r--r-- | data.py | 33 | ||||
-rw-r--r-- | db.py | 29 |
2 files changed, 33 insertions, 29 deletions
```diff
@@ -0,0 +1,33 @@
+from re import sub
+from html import unescape
+from argparse import ArgumentParser
+
+from psycopg2 import connect
+
+
+parser = ArgumentParser()
+parser.add_argument('-d', '--database', help='database connection string')
+parser.add_argument('-o', '--output', help='Output file', default='data')
+args = parser.parse_args()
+
+
+# Fetch messages from database since it's way faster than using the API
+conn = connect(args.database)
+cur = conn.cursor()
+cur.execute('SELECT * FROM statuses')
+statuses = cur.fetchall()
+
+
+# Use regex to remove HTML stuff
+text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
+# Extract all words from statuses
+words = [word for message in text for word in message.split()]
+# Remove URLs and special characters and convert to lowercase
+words = [sub(r'[^a-z0-9]', '', word.lower()) for word in words if word.find('://') == -1]
+# Remove empty strings
+words = [word for word in words if word != '']
+
+
+with open(args.output, 'w') as f:
+    for word in words:
+        f.write(word + '\n')
@@ -1,29 +0,0 @@
-#!/usr/bin/python3
-
-import re
-import psycopg2
-
-
-# Fetch messages from database since it's way faster than using the API
-conn = psycopg2.connect(dbname="mastodon_production")
-cur = conn.cursor()
-cur.execute('SELECT * FROM statuses')
-statuses = cur.fetchall()
-
-
-# Extract all words from statuses
-# Use regex to remove HTML stuff
-text = [re.sub(r'<[^>]*>', ' ', status[2]) for status in statuses]
-# print(text[0:100])
-
-
-words = [word for message in text for word in message.split()]
-# Remove URLs and special characters and convert to lowercase
-words = [re.sub(r'[^a-z0-9]', '', word.lower()) for word in words if word.find('://') == -1]
-# Remove empty strings
-words = [word for word in words if word != '']
-
-
-with open('/tmp/text', 'w') as f:
-    for word in words:
-        f.write(word + '\n')
```