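"""Serve text completions from GPT-Neo 2.7B over HTTP on a unix-domain socket.

A GET request whose path is the URL-encoded prompt returns the sampled
continuation as plain text.
"""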
from http.server import BaseHTTPRequestHandler
from os import chmod
from pathlib import Path
from socketserver import UnixStreamServer
from urllib.parse import unquote

from torch import float16
from transformers import AutoModelForCausalLM, AutoTokenizer


# https://stackoverflow.com/questions/21650370/setting-up-an-http-server-that-listens-over-a-file-socket
class UnixHTTPServer(UnixStreamServer):
    def get_request(self):
        # Unix sockets have no peer address, but BaseHTTPRequestHandler
        # expects a (host, port) pair for logging; substitute a dummy one.
        request, _ = super().get_request()
        return (request, ['local', 0])


class TextgenHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        # The prompt is the request path, URL-decoded, without the leading '/'.
        prompt = unquote(self.path[1:])
        print('Prompt')
        print(prompt)

        # Browsers also request a favicon; answer 404 instead of silently
        # dropping the connection.
        if prompt == 'favicon.ico':
            self.send_error(404)
            return

        # max_length caps prompt + continuation at 500 tokens; top_p enables
        # nucleus sampling.
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
        output = tokenizer.decode(model.generate(
            input_ids, do_sample=True, max_length=500, top_p=0.9)[0])
        print(output)

        # Content-Length must count the encoded bytes, not the characters.
        body = output.encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)
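
# Note: UnixStreamServer is single-threaded by default, so concurrent
# requests queue up while a generation is in progress.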


# Load model
print('Loading model')
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B',
    torch_dtype=float16, low_cpu_mem_usage=True).to('cuda')
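# Rough sizing (an estimate, not measured here): 2.7e9 parameters at 2 bytes
# each in float16 is ~5.4 GB of weights, so the GPU needs roughly 6 GB free
# before activation overhead.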


# Create and start server
print('Starting server')
path = '/srv/http/pages/textgen'
Path(path).unlink(missing_ok=True)  # remove a stale socket from a previous run
server = UnixHTTPServer(path, TextgenHandler)
chmod(path, 0o666)  # mode is octal; bare 666 would set nonsense permission bits
print('Server ready')
server.serve_forever()
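

# Manual test sketch (assumes curl >= 7.40 for --unix-socket support; the
# hostname in the URL is ignored when talking to a unix socket):
#
#   curl --unix-socket /srv/http/pages/textgen 'http://localhost/Once%20upon%20a%20time'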