import logging
from argparse import ArgumentParser
from urllib.error import HTTPError
from urllib.request import urlopen

import bs4
from sqlalchemy import desc
from sqlalchemy.orm import sessionmaker

import db

parser = ArgumentParser(description="Scrape a recipe site for recipes")
parser.add_argument('site', help='Name of site')
parser.add_argument('-id', '--identifier', dest='id',
                    help='URL of recipe (relative to base URL of site) or comma-separated list')
parser.add_argument('-a', '--auto', action='store', dest='n',
                    help='automatically generate identifiers (must supply number of recipes to scrape)')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()

if args.verbose:
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

eng = db.get_engine()
S = sessionmaker(eng)

with S.begin() as sess:
    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()

    # Work out which recipe identifiers to fetch.
    recipe_ids = []
    starting_id = 0
    if args.id and not args.n:
        # One or more explicit identifiers, comma-separated.
        recipe_ids.extend(args.id.split(','))
        logging.info(f'Retrieving recipe(s): {args.id}')
    elif args.n:
        if not args.id:
            # Resume after the highest identifier already stored for this site.
            # Identifiers are assumed to be numeric; note that ordering is
            # lexicographic if the column is text.
            last_recipe = sess.query(db.Recipe).\
                where(db.Recipe.recipe_site_id == site.id).\
                order_by(desc(db.Recipe.identifier)).\
                limit(1).\
                scalar()
            if last_recipe is not None:
                starting_id = int(last_recipe.identifier) + 1
        else:
            starting_id = int(args.id)
        recipe_ids = range(starting_id, starting_id + int(args.n))
        logging.info(f'Retrieving {args.n} recipes from {site.base_url} starting at {starting_id}')

    for recipe_id in recipe_ids:
        recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
        recipe_url = f'{site.base_url}/{recipe.identifier}'
        logging.info(f'Loading recipe: {recipe_url}')
        try:
            # urlopen raises HTTPError for a 404 instead of returning a
            # response, so there is no need to check getcode() afterwards.
            with urlopen(recipe_url) as f:
                recipe_page = bs4.BeautifulSoup(f.read().decode(), 'html.parser')
            name_candidates = recipe_page.find_all(class_=site.name_class)
            if len(name_candidates) == 0:
                raise Exception(f"Could not extract recipe name: {recipe_url}")
            name_div = name_candidates[0]
        except HTTPError as e:
            if e.code == 404:
                logging.warning(f"Recipe does not exist: {recipe_url}")
            else:
                logging.warning(f"Could not download recipe: {recipe_url}")
                logging.warning(e)
            continue
        except Exception as e:
            logging.warning(f"Could not download or parse recipe: {recipe_url}")
            logging.warning(e)
            continue

        recipe.name = name_div.text
        logging.info(f"Adding recipe {recipe}")
        sess.add(recipe)
        sess.flush()  # populate recipe.id so the ingredients can reference it

        ingredients = []
        ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
        for ingredient in ingred_candidates:
            ingredients.append(db.RecipeIngredient(text=ingredient.text, recipe_id=recipe.id))
        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
        sess.add_all(ingredients)
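
# Example invocations, based on the argparse setup above. The filename
# scrape.py and the site name 'example_site' are placeholders, not part of
# the original script:
#
#   python scrape.py example_site --identifier 1234 -v       # one recipe
#   python scrape.py example_site --identifier 1234,5678     # explicit list
#   python scrape.py example_site --auto 50                  # resume after last stored id
#   python scrape.py example_site --auto 50 --identifier 100 # 50 recipes from id 100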
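
# For reference, a minimal sketch of the `db` module this script expects,
# inferred from the attributes used above. Column types, table names, and
# the engine URL are assumptions; the real module may differ:
#
#   import sqlalchemy as sa
#   from sqlalchemy.orm import declarative_base
#
#   Base = declarative_base()
#
#   def get_engine():
#       return sa.create_engine('sqlite:///recipes.db')  # placeholder URL
#
#   class RecipeSite(Base):
#       __tablename__ = 'recipe_site'
#       id = sa.Column(sa.Integer, primary_key=True)
#       name = sa.Column(sa.String)
#       base_url = sa.Column(sa.String)
#       name_class = sa.Column(sa.String)        # CSS class of the recipe-name element
#       ingredient_class = sa.Column(sa.String)  # CSS class of ingredient elements
#
#   class Recipe(Base):
#       __tablename__ = 'recipe'
#       id = sa.Column(sa.Integer, primary_key=True)
#       identifier = sa.Column(sa.String)
#       name = sa.Column(sa.String)
#       recipe_site_id = sa.Column(sa.Integer, sa.ForeignKey('recipe_site.id'))
#
#   class RecipeIngredient(Base):
#       __tablename__ = 'recipe_ingredient'
#       id = sa.Column(sa.Integer, primary_key=True)
#       text = sa.Column(sa.String)
#       recipe_id = sa.Column(sa.Integer, sa.ForeignKey('recipe.id'))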