"""Minimal Flask app that streams llama.cpp completions over a unix socket.

The request's URL path is used as the prompt; the model's stdout is
streamed back byte-by-byte as a ``text/plain`` response.
"""

import os
import subprocess
import threading
import time

from flask import Flask, Response

app = Flask(__name__)

# Unix domain socket the dev server binds to (see app.run below).
SOCKET_PATH = "/srv/http/pages/textgen"


@app.route("/<prompt>")
def llama(prompt):
    """Stream a llama.cpp completion for *prompt* as plain text."""
    if prompt == "favicon.ico":
        # Browsers request this automatically; don't feed it to the model.
        return Response(status=204)

    def generate():
        process = None  # sentinel so `finally` is safe if Popen raises
        try:
            process = subprocess.Popen(
                [
                    "/opt/llama.cpp/main",
                    "-ngl", "32",
                    "-m", "/opt/llama.cpp/models/wizardLM-7B.ggmlv3.q4_0.bin",
                    "-n", "1024",
                    "-p", f"{prompt}\n\n### Response:",
                ],
                stdout=subprocess.PIPE,
            )
            # Read a single byte at a time so each token reaches the client
            # as soon as the model emits it (no chunk buffering).
            for c in iter(lambda: process.stdout.read(1), b""):
                yield c
        finally:
            # Stop the model when the stream ends or the client disconnects
            # mid-generation. Guarded: Popen itself may have raised, in
            # which case `process` is still None.
            if process is not None:
                process.kill()

    return Response(generate(), mimetype="text/plain")


def fixperms():
    """Loosen the socket's permissions shortly after the server binds it.

    NOTE(review): the 0.1 s sleep races with app.run() creating the
    socket; the dev server offers no post-bind hook, so this is a
    best-effort workaround — confirm it is reliable enough in practice.
    """
    time.sleep(0.1)
    # BUG FIX: the original passed decimal 660 (== 0o1224, sticky bit plus
    # garbage bits); chmod modes are octal, so rw-rw---- is 0o660.
    os.chmod(SOCKET_PATH, 0o660)


threading.Thread(target=fixperms).start()
app.run(host="unix://" + SOCKET_PATH)