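# extract_nouns.py
# Pulls simple noun phrases out of the website's strings (../website/strings.json)
# via the OpenAI chat API, caching per-chunk results in a shelve database so
# reruns only pay for text that has not been processed before.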

import openai
import json
import os
import shelve
import random

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def chunks(text, size):
    # Split text into newline-aligned chunks of roughly `size` characters:
    # each chunk grows line by line until it exceeds `size`, so chunks may run
    # slightly over, and the final chunk may be shorter.
    out = [""]
    for line in text.split("\n"):
        out[-1] += line + "\n"
        if len(out[-1]) > size:
            out.append("")
    return [x.removesuffix("\n") for x in out if x]
def extract_nouns(text):
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": f"""Extract all unique simple noun phrases from this document and put them in a JSON array in the singular:
```
{text}
```"""}],
        response_format={"type": "json_object"},
        max_tokens=16384,
        temperature=0.2 # should be 0 but repetition issues at 0
    )
    result = json.loads(completion.choices[0].message.content)
    # JSON mode guarantees an object rather than a bare array; the model wraps
    # the array under a key of its choosing, so take whichever key comes back.
    return result[next(iter(result.keys()))]
with open("../website/strings.json", "r") as f:
    strings = json.load(f)

nouns = set()
with shelve.open("nouns_cache.db") as db:
    for bigstring in strings:
        for string in chunks(bigstring, 8192):
            # progress indicator: sample a few of the nouns collected so far
            if nouns: print(random.choices(list(nouns), k=10))
            if string in db:
                # chunk already extracted on a previous run; reuse the cached result
                nouns.update(db[string])
            else:
                print("reading:", string[:100])
                s_nouns = extract_nouns(string)
                nouns.update(s_nouns)
                print(len(s_nouns), "/", len(nouns))
                db[string] = s_nouns

with open("nouns.json", "w") as f:
    json.dump(list(nouns), f)
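
# Usage sketch (assumes OPENAI_API_KEY is set in the environment and that
# ../website/strings.json exists, as the script expects):
#   OPENAI_API_KEY=sk-... python extract_nouns.py
# nouns.json then holds a flat, deduplicated JSON array of the extracted noun phrases.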