# textgen - A simple llama.cpp web API

import os
import subprocess
import threading
import time

from flask import Flask, Response

# Flask uses the import name to locate the application's resources, so it
# must be this module's __name__ rather than an undefined `name` variable.
app = Flask(__name__)

@app.route("/<prompt>")
def llama(prompt):
    """Stream a llama.cpp completion for ``prompt`` as plain text.

    Args:
        prompt: The text captured from the URL path segment. It is passed
            to the model followed by a ``### Response:`` marker.

    Returns:
        An empty 204 response for ``favicon.ico``; otherwise a streaming
        ``text/plain`` response carrying the raw bytes the llama.cpp
        process writes to its stdout.
    """
    # Do not spend a model run on a favicon request.
    if prompt == "favicon.ico":
        return Response(status=204)

    def generate():
        """Yield chunks of the llama.cpp process's stdout as they arrive."""
        command = [
            "/opt/llama.cpp/main",
            "-ngl",
            "32",
            "-m",
            "/opt/llama.cpp/models/wizardLM-7B.ggmlv3.q4_0.bin",
            "-n",
            "1024",
            "-p",
            f"{prompt}\n\n### Response:",
        ]
        # The process is started outside the try block so that a failed
        # launch raises its own error instead of an UnboundLocalError from
        # the cleanup code. Leaving the `with` block closes the pipe and
        # waits for the child, so no zombie process is left behind.
        with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
            try:
                # read1() returns as soon as any output is available, so
                # the response still streams without a per-byte read loop.
                for chunk in iter(lambda: process.stdout.read1(4096), b""):
                    yield chunk
            finally:
                # Also runs when the generator is closed early; stop the
                # model rather than letting it finish generating unread.
                process.kill()

    return Response(generate(), mimetype="text/plain")

# Filesystem path used both as the server's "unix://" address and as the
# default target of fixperms().
path = "/srv/http/pages/textgen"


def fixperms(target=None, mode=0o660, delay=0.1):
    """Set the permission bits of a file after a short delay.

    Args:
        target: File to chmod. Defaults to the module-level ``path``.
        mode: Permission bits to apply. The default is octal ``0o660``
            (read/write for owner and group); the decimal literal ``660``
            would be ``0o1224`` and set the wrong bits.
        delay: Seconds to sleep before changing the mode. Presumably this
            gives the server time to create the socket file; confirm
            against the startup code.

    Raises:
        OSError: If the target does not exist or cannot be modified.
    """
    time.sleep(delay)
    os.chmod(path if target is None else target, mode)

# app.run() below blocks, so the permission fix-up is started first on a
# separate thread.
perm_thread = threading.Thread(target=fixperms)
perm_thread.start()

# Serve on the "unix://" address built from `path`.
socket_address = f"unix://{path}"
app.run(host=socket_address)