77 lines
2.5 KiB
Python
77 lines
2.5 KiB
Python
import sqlite3
|
|
import json
|
|
import sys
|
|
from tqdm import tqdm
|
|
from requests_html import HTMLSession
|
|
# HTTP session object reused for every request made by this script.
session = HTMLSession()

# SQLite database file 'tlfi.db' and a cursor on it.
con = sqlite3.connect('tlfi.db')
cur = con.cursor()

# Optional first command-line argument, converted to int; 1 when absent.
limit = int(sys.argv[1]) if sys.argv[1:] else 1
print("Limit:", limit)
|
|
|
|
def get_pages(word, link):
    """Yield the ``#lexicontent`` HTML of each entry page of *word*.

    The page at *link* is fetched first. The number of elements matching
    ``a`` inside its ``#vtoolbar`` element is used as the entry count and is
    reported through ``tqdm.write``. The first page's content is yielded,
    then the content of ``{link}/1`` up to ``{link}/{count - 1}``.

    An ``AttributeError`` is raised if ``#vtoolbar`` or ``#lexicontent`` is
    not found, because ``find(..., first=True)`` then yields ``None``.
    """
    def lexicontent(response):
        # HTML of the element holding the entry text.
        return response.html.find('#lexicontent', first=True).html

    first = session.get(link)
    toolbar = first.html.find('#vtoolbar', first=True)
    nb_entries = len(toolbar.find('a'))
    tqdm.write(f"Word '{word}' has {nb_entries} entries.")
    yield lexicontent(first)

    # Entry 0 is the page already fetched above, so start at 1.
    for entry in range(1, nb_entries):
        yield lexicontent(session.get(f"{link}/{entry}"))
|
|
|
|
def get_words(link):
    """Yield one ``{name: url}`` dict per listing page under *link*.

    Pages are requested as ``{link}/0``, ``{link}/80``, ``{link}/160`` and
    so on. For each page, every absolute link found inside the ``.hometab``
    element is keyed by the last ``/``-separated segment of its URL.
    Iteration stops after the first page whose HTML does not contain the
    text "Page suivante".
    """
    offset = 0
    has_next = True
    while has_next:
        response = session.get(f"{link}/{offset}")
        table = response.html.find('.hometab', first=True)

        batch = {}
        for url in table.absolute_links:
            batch[url.rsplit('/')[-1]] = url
        yield batch

        # The site marks the existence of a further page with this label.
        has_next = "Page suivante" in response.html.html
        offset += 80
|
|
|
|
def get_letters_and_words():
    """Collect the word links of every letter and store them in ``word_links``.

    The index page is fetched and each absolute link inside its
    ``.letterHeader`` element is keyed by the final character of its URL.
    Letters are processed in sorted order: all batches from ``get_words``
    are merged into one dict (a dot is printed per batch), the resulting
    ``(word, link)`` pairs are inserted into ``word_links``, and the
    transaction is committed before moving to the next letter.
    """
    index = session.get('https://www.cnrtl.fr/definition/')
    header = index.html.find('.letterHeader', first=True)

    letters_dic = {}
    for url in header.absolute_links:
        letters_dic[url[-1]] = url

    for letter in sorted(letters_dic):
        print(f"Processing letter {letter}")

        words = {}
        for batch in get_words(letters_dic[letter]):
            words.update(batch)
            print('.', end='', flush=True)  # progress marker, one per batch
        print('')

        print(f"Found {len(words)} words for letter {letter}.")
        cur.executemany("INSERT INTO word_links VALUES(?, ?)", words.items())
        con.commit()
|
|
|
|
# Script entry: what a run does depends on which tables already exist.
#
# * No `word_links` table yet: create it and fill it with the word links
#   gathered by get_letters_and_words(). No pages are fetched in that run.
# * Otherwise: make sure `word_pages` exists, then fetch the pages of up to
#   `limit` words that have no row with a non-NULL page yet, in word order.
#
# The connection is closed in a `finally` so that it is released even when
# an exception (including Ctrl-C) propagates out of the work below.
try:
    res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_links'")
    if res.fetchone() is None:
        cur.execute("CREATE TABLE word_links(word, link)")
        get_letters_and_words()
    else:
        res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_pages'")
        if res.fetchone() is None:
            cur.execute("CREATE TABLE word_pages(word, page)")

        # Materialise the work list first: the same cursor is reused for the
        # INSERTs inside the loop.
        pending = cur.execute(
            "SELECT word, link FROM word_links LEFT JOIN word_pages USING (word) WHERE page IS NULL ORDER BY word LIMIT ?",
            (limit,),
        ).fetchall()

        for word, link in tqdm(pending):
            tqdm.write(f"Looking up word '{word}'")
            try:
                pages = list(get_pages(word, link))
                j = json.dumps(pages, indent=4)
            except Exception:
                # Best effort: record the failure and move on. `Exception`
                # (not a bare `except`) lets KeyboardInterrupt/SystemExit
                # stop the run instead of being stored as a failed word.
                # Note that a word stored as "ERROR" has a non-NULL page and
                # is therefore not selected again by the query above.
                j = "ERROR"
                tqdm.write(f"ERROR on word '{word}'")
            cur.execute("INSERT INTO word_pages VALUES(?, ?)", (word, j))
            # Commit per word so an interrupted run keeps finished words.
            con.commit()
finally:
    con.close()
|