"""Scrape word links and definition pages from cnrtl.fr into 'tlfi.db'.

On a run where the 'word_links' table does not exist yet, the table is
created and filled with one (word, link) row per word found under each
letter index. On later runs, up to LIMIT words that have no row in
'word_pages' yet are fetched and their pages stored as a JSON list of HTML
strings. LIMIT is the first command-line argument and defaults to 1.
"""
import sqlite3
import json
import sys
from tqdm import tqdm
from requests_html import HTMLSession

session = HTMLSession()
con = sqlite3.connect('tlfi.db')
cur = con.cursor()

# Maximum number of words to fetch pages for in this run.
if len(sys.argv) > 1:
    limit = int(sys.argv[1])
else:
    limit = 1
print("Limit:", limit)


def get_pages(word, link):
    """Yield the HTML of the '#lexicontent' element for each entry of a word.

    The number of entries is the number of <a> elements found inside the
    '#vtoolbar' element of the first page. The first entry is read from
    ``link`` itself; the others from ``link/1`` .. ``link/(nb_entries - 1)``.

    Args:
        word: The word being looked up (only used in the progress message).
        link: URL of the word's first definition page.

    Raises:
        AttributeError: If '#vtoolbar' or '#lexicontent' is not found on a
            fetched page (``find(..., first=True)`` then returns None).
    """
    r = session.get(link)
    nb_entries = len(r.html.find('#vtoolbar', first=True).find('a'))
    tqdm.write(f"Word '{word}' has {nb_entries} entries.")
    page = r.html.find('#lexicontent', first=True)
    yield page.html
    # Entry 0 is the page already fetched above, so start at 1.
    for entry in range(1, nb_entries):
        r = session.get(f"{link}/{entry}")
        page = r.html.find('#lexicontent', first=True)
        yield page.html


def get_words(link):
    """Yield one ``{word: url}`` dict per index page under ``link``.

    Pages are requested as ``link/0``, ``link/80``, ``link/160``, ... for as
    long as the fetched page's HTML contains the text "Page suivante".
    Each dict maps the last '/'-separated segment of every absolute link
    found inside the '.hometab' element to that link.

    Args:
        link: Base URL of a letter's index.
    """
    offset = 0
    while True:
        r = session.get(f"{link}/{offset}")
        words = r.html.find('.hometab', first=True)
        # Use a distinct name for the loop variable so the `link` parameter,
        # which is needed again on the next iteration, is never shadowed.
        words_dic = {
            url.rsplit('/')[-1]: url for url in words.absolute_links
        }
        yield words_dic
        if "Page suivante" in r.html.html:
            offset += 80
        else:
            break


def get_letters_and_words():
    """Collect every word link, letter by letter, into 'word_links'.

    Reads the absolute links inside the '.letterHeader' element of the
    definition index page, keys them by their last character, and for each
    letter (in sorted order) inserts all (word, link) pairs returned by
    ``get_words`` and commits.
    """
    r = session.get('https://www.cnrtl.fr/definition/')
    letters = r.html.find('.letterHeader', first=True)
    letters_dic = {url[-1]: url for url in letters.absolute_links}
    for letter, letter_link in sorted(letters_dic.items()):
        print(f"Processing letter {letter}")
        words = {}
        for batch in get_words(letter_link):
            words.update(batch)
            # One dot per index page fetched, as a progress indicator.
            print('.', end='', flush=True)
        print('')
        nb = len(words)
        print(f"Found {nb} words for letter {letter}.")
        cur.executemany("INSERT INTO word_links VALUES(?, ?)", words.items())
        con.commit()


try:
    res = cur.execute(
        "SELECT name FROM sqlite_master WHERE name='word_links'"
    )
    if res.fetchone() is None:
        # First run: build the word index only.
        cur.execute("CREATE TABLE word_links(word, link)")
        get_letters_and_words()
    else:
        res = cur.execute(
            "SELECT name FROM sqlite_master WHERE name='word_pages'"
        )
        if res.fetchone() is None:
            cur.execute("CREATE TABLE word_pages(word, page)")
        # Words with no row in word_pages yet, in alphabetical order.
        pending = cur.execute(
            "SELECT word, link FROM word_links LEFT JOIN word_pages USING (word) WHERE page IS NULL ORDER BY word LIMIT ?",
            (limit,),
        ).fetchall()
        for word, link in tqdm(pending):
            tqdm.write(f"Looking up word '{word}'")
            try:
                pages = list(get_pages(word, link))
                j = json.dumps(pages, indent=4)
            except Exception:
                # Record the failure so the word is not selected again, but
                # let KeyboardInterrupt / SystemExit propagate so that
                # interrupting the run does not mark a word as failed.
                j = "ERROR"
                tqdm.write(f"ERROR on word '{word}'")
            cur.execute("INSERT INTO word_pages VALUES(?, ?)", (word, j))
            # Commit per word so progress survives an interrupted run.
            con.commit()
finally:
    # Close the connection even when scraping aborts with an exception.
    con.close()