Initial commit: scrape to sqlite
This commit is contained in:
commit
33bdcb16bf
76
tlfi.py
Normal file
76
tlfi.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import sqlite3
|
||||
import json
|
||||
import sys
|
||||
from tqdm import tqdm
|
||||
from requests_html import HTMLSession
|
||||
# One HTTP session shared by every request this script makes.
session = HTMLSession()

# SQLite database file receiving the scraped data, and the cursor used for
# all statements below.
con = sqlite3.connect('tlfi.db')
cur = con.cursor()

# Optional first command-line argument: value bound to the LIMIT clause of
# the query that selects the words to look up; defaults to 1 when absent.
limit = int(sys.argv[1]) if len(sys.argv) > 1 else 1
print("Limit:", limit)
|
||||
|
||||
def get_pages(word, link):
    """Yield the HTML of the `#lexicontent` element of every entry of *word*.

    The page at *link* is fetched first; the number of `a` elements found
    inside its `#vtoolbar` element is taken as the number of entries. That
    first page provides the first result, and the remaining ones are fetched
    from `{link}/1` up to `{link}/{nb_entries - 1}`.

    Args:
        word: The word being looked up (only used in the progress message).
        link: URL of the first page for that word.

    Yields:
        The HTML source of each page's `#lexicontent` element, as a string.
    """
    first_response = session.get(link)
    toolbar = first_response.html.find('#vtoolbar', first=True)
    nb_entries = len(toolbar.find('a'))
    tqdm.write(f"Word '{word}' has {nb_entries} entries.")

    content = first_response.html.find('#lexicontent', first=True)
    yield content.html

    # The first entry was served by `link` itself, so numbering resumes at 1.
    extra_urls = (f"{link}/{index}" for index in range(1, nb_entries))
    for url in extra_urls:
        content = session.get(url).html.find('#lexicontent', first=True)
        yield content.html
|
||||
|
||||
def get_words(link):
    """Yield one `{word: url}` dict per listing page found under *link*.

    Listing pages are requested as `{link}/{offset}` with the offset starting
    at 0 and growing by 80. Each dict maps the last path segment of every
    absolute link found in the page's `.hometab` element to that link.
    Iteration stops after the first page whose HTML does not contain the
    text "Page suivante".

    Args:
        link: Base URL of the listing to walk through.

    Yields:
        A dict mapping the last URL segment to the full URL, one per page.
    """
    offset = 0
    has_next = True
    while has_next:
        response = session.get(f"{link}/{offset}")
        table = response.html.find('.hometab', first=True)

        batch = {}
        for url in table.absolute_links:
            batch[url.rsplit('/', 1)[-1]] = url
        yield batch

        # The presence of the "next page" label decides whether to go on.
        has_next = "Page suivante" in response.html.html
        offset += 80
|
||||
|
||||
def get_letters_and_words():
    """Collect the word links of every letter and store them in `word_links`.

    The absolute links found in the `.letterHeader` element of
    https://www.cnrtl.fr/definition/ are keyed by their last character and
    processed in sorted key order. For each of them, all batches produced by
    `get_words` are merged into one dict, whose `(word, link)` pairs are then
    inserted into the `word_links` table and committed.

    Progress is printed to standard output: one dot per batch, followed by
    the number of words found for the letter.
    """
    index_page = session.get('https://www.cnrtl.fr/definition/')
    header = index_page.html.find('.letterHeader', first=True)

    # Key every letter page by the final character of its URL.
    links_by_letter = {}
    for url in header.absolute_links:
        links_by_letter[url[-1]] = url

    for letter in sorted(links_by_letter):
        letter_link = links_by_letter[letter]
        print(f"Processing letter {letter}")

        words = {}
        for batch in get_words(letter_link):
            words.update(batch)
            print('.', end='', flush=True)
        print('')

        print(f"Found {len(words)} words for letter {letter}.")
        cur.executemany("INSERT INTO word_links VALUES(?, ?)", words.items())
        con.commit()
|
||||
|
||||
# Two-stage driver.
#  * No `word_links` table yet: create it and fill it with the word index.
#  * Otherwise: fetch the pages of up to `limit` words that have no row in
#    `word_pages` yet and store them as a JSON list of HTML strings.
# The connection is closed in `finally` so it is released even when the run
# is interrupted or fails.
try:
    res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_links'")
    if res.fetchone() is None:
        cur.execute("CREATE TABLE word_links(word, link)")
        get_letters_and_words()
    else:
        cur.execute("CREATE TABLE IF NOT EXISTS word_pages(word, page)")

        # Words without a stored page, in alphabetical order.
        pending = cur.execute(
            "SELECT word, link FROM word_links LEFT JOIN word_pages USING (word) "
            "WHERE page IS NULL ORDER BY word LIMIT ?",
            (limit,),
        ).fetchall()

        for word, link in tqdm(pending):
            tqdm.write(f"Looking up word '{word}'")
            try:
                pages = list(get_pages(word, link))
                j = json.dumps(pages, indent=4)
            except Exception as exc:
                # Only genuine failures are recorded. KeyboardInterrupt and
                # SystemExit are not subclasses of Exception, so Ctrl-C stops
                # the run instead of storing a bogus "ERROR" row.
                # The "ERROR" marker is a non-NULL page, so the word is not
                # selected again by the query above.
                j = "ERROR"
                tqdm.write(f"ERROR on word '{word}' ({exc!r})")
            cur.execute("INSERT INTO word_pages VALUES(?, ?)", (word, j))
            # Commit after every word so finished work survives an interruption.
            con.commit()
finally:
    con.close()
|
||||
Loading…
Reference in a new issue