mkbinder.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

#!/usr/bin/env python3
import pdfkit
import weasyprint
import re
import os
import argparse
import pickle
from htmldate import find_date
from datetime import date


# CLI arguments
#   --backend/-b  selects which library renders a URL to PDF.
#   --force/-f    re-downloads every link, ignoring the cached edit dates.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--backend', '-b',
    dest='backend',
    help='change the download backend; default: pdfkit',
    default='pdfkit',
    choices=['pdfkit', 'weasyprint'],
)
# --force is a plain on/off switch.  It used to be declared with
# choices=[False, True] and no type, so argparse compared the raw
# command-line string (e.g. 'True') against the booleans and rejected every
# value that could actually be typed; the option could never be enabled.
parser.add_argument(
    '--force', '-f',
    dest='force',
    action='store_true',
    help='force download all links instead of only the ones that need to be updated; default: False',
)
args = parser.parse_args()


# Sentinel dates used when comparing a page's edit date with the cached one.
# A page whose date cannot be detected compares newer than anything, so it is
# downloaded on every run and never stored in the cache.
_UNKNOWN_DATE = date.fromisoformat("9999-01-01")
# A link with no cache entry compares older than anything, so it is always
# downloaded the first time it is seen.
_NEVER_DOWNLOADED = date.fromisoformat("1970-01-01")

# Every "Links/<binder>.txt" file lists one URL per line ('#' starts a
# comment).  Each URL is rendered to "<binder>/<sanitized-url>.pdf", and the
# last seen edit date per URL is cached in "Links/<binder>.pickle".
for filename in os.listdir("Links"):
    if not filename.endswith(".txt"):
        continue
    if filename == "requirements.txt":
        continue

    print("Examining: " + filename)

    binder = filename[:-4]  # drop the ".txt" suffix
    try:
        os.makedirs(binder, exist_ok=True)
    except OSError as exc:
        # Without the output directory every download would fail anyway.
        print("Cannot create output directory " + binder + ": " + str(exc))
        continue

    cache_path = os.path.join("Links", binder + ".pickle")
    try:
        # NOTE(security): pickle can execute arbitrary code while loading.
        # Only load cache files written by this script.
        with open(cache_path, 'rb') as cache_file:
            dates = pickle.load(cache_file)
    except FileNotFoundError:
        # First run for this binder: nothing cached yet.
        dates = {}
    except (OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError) as exc:
        # Unreadable or corrupt cache: fall back to downloading everything.
        print("Ignoring unreadable date cache " + cache_path + ": " + str(exc))
        dates = {}

    with open(os.path.join("Links", filename), "r") as file:
        links = file.readlines()

    for raw_line in links:
        # strip() instead of slicing off the last character: the final line
        # of a file may have no trailing newline, and slicing would then cut
        # the last character off the URL.
        link = raw_line.strip()
        if not link or link.startswith("#"):
            continue

        new_date_str = find_date(link)
        if new_date_str is None:
            new_date = _UNKNOWN_DATE
        else:
            new_date = date.fromisoformat(new_date_str)

        old_date = dates.get(link, _NEVER_DOWNLOADED)

        if new_date > old_date or args.force:
            print("Downloading: " + link)
            print("Edit date: " + str(new_date))

            # Drop the first five characters (the "http:"/"https" prefix)
            # and every character that is not a word character, '-' or '.'
            # to get a file-system-safe name.
            name = os.path.join(binder, re.sub(r'(?u)[^-\w.]', '', link[5:]) + ".pdf")
            try:
                # weasyprint seems faster?
                # but seems to be broken sometimes???
                if args.backend == 'pdfkit':
                    pdfkit.from_url(link, name)
                else:
                    pdf = weasyprint.HTML(link).write_pdf()
                    with open(name, 'wb') as pdf_file:
                        pdf_file.write(pdf)
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # link.  Catching Exception (not a bare except) keeps Ctrl-C
                # working during long downloads.
                print("Error when printing: " + str(exc))

            # NOTE(review): the date is recorded even when the download
            # above failed, so a failed link is not retried until its page
            # changes (or --force is used).  Kept as-is; confirm intent.
            if new_date != _UNKNOWN_DATE:
                dates[link] = new_date

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(dates, cache_file)