recipe-graph/src/scrape.py

84 lines
3.2 KiB
Python
Raw Normal View History

2022-07-18 11:13:53 -04:00
import db
2022-07-20 18:00:49 -04:00
from sqlalchemy import select, desc
2022-07-18 11:13:53 -04:00
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen
import logging
from argparse import ArgumentParser
def _build_cli():
    """Create the command-line parser for the scraper.

    Returns:
        An ``ArgumentParser`` with a positional ``site`` and the optional
        ``-id/--identifier`` (stored as ``id``), ``-a/--auto`` (stored as
        ``n``) and ``-v/--verbose`` switches.
    """
    cli = ArgumentParser(description="Scrape a recipe site for recipies")
    cli.add_argument("site", help="Name of site")
    cli.add_argument(
        "-id", "--identifier",
        dest="id",
        help="url of recipe(reletive to base url of site) or commma seperated list",
    )
    cli.add_argument(
        "-a", "--auto",
        action="store",
        dest="n",
        help="automaticaly generate identifier(must supply number of recipies to scrape)",
    )
    cli.add_argument("-v", "--verbose", action="store_true")
    return cli


def _enable_verbose_logging():
    """Configure logging at INFO and set the SQLAlchemy engine logger to INFO."""
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)


# `parser` and `args` remain module-level names: the rest of the script
# reads `args.site`, `args.id` and `args.n` directly.
parser = _build_cli()
args = parser.parse_args()
if args.verbose:
    _enable_verbose_logging()
def _int_option(value, flag):
    """Convert a numeric command-line value to ``int``.

    Args:
        value: Raw string received from argparse.
        flag: Option name used in the error message.

    Returns:
        The parsed integer. When ``value`` is not a number the script exits
        through ``parser.error`` (usage message, exit status 2) instead of
        surfacing a ``ValueError`` traceback.
    """
    try:
        return int(value)
    except ValueError:
        parser.error(f'{flag} must be an integer, got {value!r}')


def _split_identifiers(raw):
    """Split a comma separated ``-id`` value into a list of identifiers.

    Surrounding whitespace is stripped and empty entries are dropped, so a
    plain single identifier yields a one-element list.
    """
    return [part.strip() for part in raw.split(',') if part.strip()]


eng = db.get_engine()
S = sessionmaker(eng)

# sessionmaker.begin() wraps the whole run in one transaction: it is committed
# when the block exits normally and rolled back if an exception escapes.
with S.begin() as sess:
    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one_or_none()
    if site is None:
        parser.error(f'Unknown site: {args.site}')

    recipe_ids = []
    if args.n:
        # Range mode: scrape `n` consecutive numeric identifiers.
        count = _int_option(args.n, '-a/--auto')
        if args.id:
            # argparse delivers strings; range() needs an integer start.
            starting_id = _int_option(args.id, '-id/--identifier')
        else:
            # Continue right after the highest identifier stored for this site.
            # NOTE(review): if `identifier` is a text column this ordering is
            # lexicographic ("9" sorts after "10") - confirm against the schema.
            last_recipe = (
                sess.query(db.Recipe)
                .where(db.Recipe.recipe_site_id == site.id)
                .order_by(desc(db.Recipe.identifier))
                .limit(1)
                .scalar()
            )
            if last_recipe is None:
                parser.error(
                    f'No recipes stored for {args.site}; '
                    'supply a starting identifier with -id'
                )
            starting_id = int(last_recipe.identifier) + 1
        recipe_ids = range(starting_id, starting_id + count)
        logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
    elif args.id:
        # Explicit mode: one identifier, or several separated by commas as
        # the option's help text advertises.
        recipe_ids = _split_identifiers(args.id)
        if len(recipe_ids) == 1:
            logging.info(f'Retreiving single recipe: {args.id}')
        else:
            logging.info(f'Retrieving {len(recipe_ids)} recipes: {recipe_ids}')
    else:
        logging.warning('Nothing to scrape: supply -id and/or -a')

    for recipe_id in recipe_ids:
        recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
        recipe_url = f'{site.base_url}/{recipe.identifier}'
        logging.info(f'Loading Recipe: {recipe_url}')
        try:
            # urlopen raises HTTPError for 404 and other error statuses, so no
            # explicit status check is needed; the handler below covers it.
            with urlopen(recipe_url) as f:
                html = f.read().decode()
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            recipe_page = bs4.BeautifulSoup(html, 'html.parser')
            name_candidates = recipe_page.find_all(class_=site.name_class)
            if not name_candidates:
                raise LookupError(f"Could not extract recipe name: {recipe_url}")
            name_div = name_candidates[0]
        except Exception as e:
            # Best effort: a recipe that cannot be fetched or parsed is
            # reported and skipped so the remaining ones are still scraped.
            logging.warning(f"Could not download or parse recipe: {recipe_url}")
            logging.warning(e)
            continue

        recipe.name = name_div.text
        logging.info(f"Adding Recipe {recipe}")
        sess.add(recipe)
        # Flush so `recipe.id` is populated before the ingredient rows
        # reference it.
        sess.flush()

        ingredients = [
            db.RecipeIngredient(text=ingredient.text, recipe_id=recipe.id)
            for ingredient in recipe_page.find_all(class_=site.ingredient_class)
        ]
        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
        sess.add_all(ingredients)