markey/import-misskey.py

55 lines
1.3 KiB
Python
Raw Normal View History

2024-02-23 12:05:37 +00:00
import re
import json
import sys
print('[+] loading nlp enhanced markov chain')
import chain
# Patterns used to strip markup from note text.  They are applied to text
# that has already been lower-cased (see clean_text).
#
# In MENTION the "-" is placed last in each character class so it is a literal
# hyphen instead of forming a range; the optional group is the "@host" part of
# a remote mention.
MENTION = re.compile(r'@[a-zA-Z0-9._-]+(@[a-zA-Z0-9._-]+)?')
# Opening of an MFM function such as "$[x2" or "$[spin.speed=2s".
MFM_BEGIN = re.compile(r'\$\[[a-z0-9.,=]+')
# Any run of closing brackets (this also removes "]" outside of MFM).
MFM_END = re.compile(r'\]+')
# Simple attribute-less tags such as "<b>" or "</small>".
HTML = re.compile(r'</?[a-z]+>')
SPACE = re.compile(r'[ \n]+')
# "don't" -> "dont": drop the apostrophe between two word runs.
CONTRACTION = re.compile(r"(\w+)'(\w+)")

# Cleaning steps in the order they must run: (pattern, replacement).
_CLEANING_STEPS = (
    (MENTION, ''),
    (MFM_BEGIN, ''),
    (MFM_END, ''),
    (HTML, ''),
    (SPACE, ' '),
    (CONTRACTION, r'\1\2'),
)


def clean_text(text):
    """Return *text* lower-cased with mentions and markup removed.

    Mentions, MFM function openers/closers and simple HTML tags are deleted,
    runs of spaces/newlines collapse to a single space, apostrophes inside
    words are dropped, and surrounding whitespace is stripped.
    """
    text = text.lower()
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    return text.strip()


def build_corpus(notes):
    """Return the cleaned text of every note that may be used for training.

    *notes* is an iterable of dicts.  A note is skipped when its
    'visibility' is not 'public' or 'unlisted', when 'localOnly' or 'cw' is
    truthy, or when it has no 'text'.  Each kept text is printed as it is
    collected.
    """
    corpus = []
    for note in notes:
        if note.get('visibility') not in ('public', 'unlisted'):
            continue
        if note.get('localOnly') or note.get('cw'):
            continue
        text = note.get('text')
        if not text:
            continue
        text = clean_text(text)
        if not text:
            # Nothing left after cleaning (e.g. the note was only a mention);
            # do not add a blank entry to the corpus.
            continue
        print(f" - {text}")
        corpus.append(text)
    return corpus


def model_path(export_path):
    """Return the path the model is written to for a given export path.

    "notes.json" becomes "notes.model.json".  Only a trailing ".json" is
    rewritten; a path without that suffix gets ".model.json" appended, so the
    result can never equal the input path.
    """
    suffix = '.json'
    if export_path.endswith(suffix):
        return export_path[:-len(suffix)] + '.model.json'
    return export_path + '.model.json'


def main():
    """Read the export named by argv[1], build the chain and write the model."""
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <notes-export.json>')

    print('[+] loading note json')
    export_f = sys.argv[1]
    # NOTE(review): assumes the export is UTF-8 encoded JSON - confirm.
    with open(export_f, encoding='utf-8') as export:
        export_json = json.load(export)

    corpus = build_corpus(export_json)

    print('[+] building markov chain')
    model = chain.Text("@@note@@".join(corpus), well_formed=False)
    model_json = model.compile().to_json()

    print('[+] exporting')
    with open(model_path(export_f), 'w', encoding='utf-8') as export:
        export.write(model_json)


if __name__ == '__main__':
    main()