tlfi/tlfi.py

77 lines
2.5 KiB
Python

import sqlite3
import json
import sys
from tqdm import tqdm
from requests_html import HTMLSession
# One HTTP session shared by every request this script makes.
session = HTMLSession()

# SQLite database file (in the current working directory) and the cursor
# used for all statements below.
con = sqlite3.connect('tlfi.db')
cur = con.cursor()

# Optional first command-line argument, parsed as an integer; 1 when absent.
# It is later bound to the LIMIT of the query that selects words to fetch.
limit = int(sys.argv[1]) if len(sys.argv) > 1 else 1
print("Limit:", limit)
def get_pages(word, link):
    """Yield the HTML of each entry page found for one word.

    ``link`` is fetched first. The number of ``a`` elements inside its
    ``#vtoolbar`` element is used as the entry count, and the
    ``#lexicontent`` element of that first response is yielded. The same
    element is then fetched from ``{link}/1`` up to ``{link}/{count - 1}``.

    Args:
        word: The word being looked up; used only in the progress message.
        link: URL of the word's page.

    Yields:
        The ``.html`` of the ``#lexicontent`` element of each fetched page.
    """
    first_response = session.get(link)
    toolbar = first_response.html.find('#vtoolbar', first=True)
    nb_entries = len(toolbar.find('a'))
    tqdm.write(f"Word '{word}' has {nb_entries} entries.")
    yield first_response.html.find('#lexicontent', first=True).html

    # The bare link has already been fetched, so numbered pages start at 1.
    for index in range(1, nb_entries):
        response = session.get(f"{link}/{index}")
        yield response.html.find('#lexicontent', first=True).html
def get_words(link):
    """Yield one ``{name: url}`` dict per page of a paginated word listing.

    Pages are requested as ``{link}/{offset}``, starting at offset 0 and
    advancing by 80 for as long as the raw HTML of the response contains
    the text "Page suivante".

    Args:
        link: Base URL of the listing.

    Yields:
        A dict built from the absolute links inside the ``.hometab``
        element of each page, keyed by the last ``/``-separated segment
        of each URL.
    """
    offset = 0
    while True:
        response = session.get(f"{link}/{offset}")
        table = response.html.find('.hometab', first=True)
        yield {url.rsplit('/')[-1]: url for url in table.absolute_links}

        # No "next page" marker in the HTML means this was the last page.
        if "Page suivante" not in response.html.html:
            return
        # NOTE(review): step of 80 presumably matches the site's page size.
        offset += 80
def get_letters_and_words():
    """Collect word links letter by letter and store them in ``word_links``.

    The index page at ``https://www.cnrtl.fr/definition/`` is fetched and
    the absolute links inside its ``.letterHeader`` element are keyed by
    their last character. For each letter, in sorted order, every batch
    yielded by ``get_words`` is merged into one dict (printing a dot per
    batch), the resulting ``(word, link)`` pairs are inserted into the
    ``word_links`` table, and the transaction is committed.
    """
    index_page = session.get('https://www.cnrtl.fr/definition/')
    header = index_page.html.find('.letterHeader', first=True)
    letters_dic = {url[-1]: url for url in header.absolute_links}

    for letter in sorted(letters_dic):
        letter_link = letters_dic[letter]
        print(f"Processing letter {letter}")

        words = {}
        for batch in get_words(letter_link):
            words.update(batch)
            # One dot per listing page as a lightweight progress indicator.
            print('.', end='', flush=True)
        print('')

        print(f"Found {len(words)} words for letter {letter}.")
        cur.executemany("INSERT INTO word_links VALUES(?, ?)", words.items())
        # Commit after each letter so completed letters are persisted.
        con.commit()
# Driver. On a run where the word_links table does not exist yet, create it
# and fill it from the site index. On later runs, fetch the entry pages of up
# to `limit` words that have no row in word_pages yet.
res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_links'")
if res.fetchone() is None:
    cur.execute("CREATE TABLE word_links(word, link)")
    get_letters_and_words()
else:
    res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_pages'")
    if res.fetchone() is None:
        cur.execute("CREATE TABLE word_pages(word, page)")

    # Words without a stored page yet; fetched eagerly so the cursor is free
    # for the INSERT statements issued inside the loop.
    pending = cur.execute(
        "SELECT word, link FROM word_links LEFT JOIN word_pages USING (word) "
        "WHERE page IS NULL ORDER BY word LIMIT ?",
        (limit,),
    ).fetchall()

    for word, link in tqdm(pending):
        tqdm.write(f"Looking up word '{word}'")
        try:
            pages = list(get_pages(word, link))
            j = json.dumps(pages, indent=4)
        except Exception:
            # Only ordinary errors are recorded as a failed lookup.
            # KeyboardInterrupt / SystemExit must propagate: a bare `except:`
            # would store "ERROR" for the word being fetched when the user
            # presses Ctrl-C and then carry on with the next word.
            j = "ERROR"
            tqdm.write(f"ERROR on word '{word}'")
        cur.execute("INSERT INTO word_pages VALUES(?, ?)", (word, j))
        # Commit per word so an interrupted run keeps everything fetched so far.
        con.commit()
con.close()