Compare commits
7 Commits
259c08fd4e
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| c4d5b3a7bf | |||
| 9d0413ada5 | |||
| a04bb06ed8 | |||
| cf05777f2c | |||
| 209597432d | |||
| e207c359ed | |||
| 35fadd6638 |
@@ -9,6 +9,7 @@ from urllib.parse import urljoin
|
|||||||
import logging
|
import logging
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
|
||||||
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
|
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
|
||||||
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
|
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
|
||||||
ingredient_regex = "([a-zA-Z '\-]+)"
|
ingredient_regex = "([a-zA-Z '\-]+)"
|
||||||
@@ -63,33 +64,27 @@ def parse_ingredient(
|
|||||||
|
|
||||||
return [text.strip() if text else None for text in m.groups()]
|
return [text.strip() if text else None for text in m.groups()]
|
||||||
|
|
||||||
|
# this code is unused
# TODO: add tests when this is used
def missing_ingredients_query(session):
    """Return the ``RecipeIngredient`` rows that have no parsed-parts row.

    The candidate ids are built with ``except_`` over the
    ``RecipeIngredient.id`` and ``RecipeIngredientParts.id`` selects, i.e.
    ids present in the first select but not in the second.

    Args:
        session: database session used to run the query.

    Returns:
        The result of ``.all()`` on the filtered ``RecipeIngredient`` query.
    """
    cte = (
        except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
    ).alias("missing")
    missing = (
        session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
    )
    return missing


# this code is unused
# TODO: add tests when this is used
def parse_missing_ingredients(session):
    """Parse every ingredient that lacks a parts row and stage the result.

    Ingredients whose text cannot be parsed are skipped. Nothing is flushed
    or committed here; that is left to the caller.

    Args:
        session: database session the new parts rows are added to.
    """
    for ingredient in missing_ingredients_query(session):
        parts = ingredient_to_parts(ingredient)
        if parts is None:
            # ingredient_to_parts yields None when the text does not parse;
            # handing None to session.add() would raise instead of skipping.
            continue
        session.add(parts)
|
|
||||||
|
|
||||||
|
|
||||||
def load_page(recipe_url):
|
def load_page(recipe_url: str) -> bs4.BeautifulSoup:
|
||||||
try:
|
try:
|
||||||
logging.info(f"Loading Page: {recipe_url}")
|
logging.info(f"Loading Page: {recipe_url}")
|
||||||
with req.get(recipe_url) as resp:
|
with req.get(recipe_url) as resp:
|
||||||
@@ -102,43 +97,61 @@ def load_page(recipe_url):
|
|||||||
logging.warning(e)
|
logging.warning(e)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: "str | None" = None,
) -> db.Recipe:
    """Set ``recipe.name`` from the first element matching ``site.name_class``.

    Args:
        site: supplies ``name_class`` (the class searched for on the page)
            and ``base_url`` (used only for messages when ``url`` is empty).
        page: parsed page that is searched with ``find_all``.
        recipe: object whose ``name`` attribute is overwritten.
        url: label for log and error messages. When empty or omitted, a
            dict holding the site base URL and the recipe identifier is
            used in its place.

    Returns:
        The same ``recipe`` object, with ``name`` filled in.

    Raises:
        Exception: if no element with class ``site.name_class`` is found.
    """
    # Kept separate from ``url`` so the ``str`` parameter is never rebound
    # to a dict; the rendered messages are unchanged.
    source = url if url else {"site": site.base_url, "recipe": recipe.identifier}

    name_candidates = page.find_all(class_=site.name_class)
    if not name_candidates:
        raise Exception(f"Could not extract recipe name: {source}")

    # Only the first match is used; any further matches are ignored.
    recipe.name = name_candidates[0].text

    logging.info(f"Adding Recipe {recipe.name} from {source}")

    return recipe
|
||||||
|
|
||||||
|
def ingredient_to_parts(
    ingredient: db.RecipeIngredient,
) -> "db.RecipeIngredientParts | None":
    """Build the parsed-parts row for one ingredient.

    Args:
        ingredient: object providing ``text`` (the raw ingredient line) and
            ``id`` (reused as the id of the returned parts row).

    Returns:
        A ``RecipeIngredientParts`` holding the quantity, unit, instruction,
        ingredient name and supplement produced by ``parse_ingredient``, or
        ``None`` when ``parse_ingredient`` returns nothing for the text.
    """
    parts = parse_ingredient(ingredient.text)
    if not parts:
        # Unparseable text: callers are expected to check for None.
        return None

    quantity, unit, instruction, ingredient_name, supplement = parts
    return db.RecipeIngredientParts(
        id=ingredient.id,
        quantity=quantity,
        unit=unit,
        instruction=instruction,
        ingredient=ingredient_name,
        supplement=supplement,
    )
|
||||||
|
|
||||||
|
|
||||||
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and stage the recipe and its ingredients.

    The page URL is ``site.base_url`` joined with ``recipe.identifier``.
    The recipe name is filled in by ``parse_recipe_name``; each element with
    class ``site.ingredient_class`` becomes a ``RecipeIngredient`` and, when
    its text parses, a matching parts row.

    Returns:
        The updated recipe, or ``None`` when the page could not be loaded.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    page = load_page(recipe_url)
    if not page:
        return None

    recipe = parse_recipe_name(site, page, recipe, recipe_url)
    session.add(recipe)
    # Presumably flushed so recipe.id is assigned before it is used below.
    session.flush()

    nodes = page.find_all(class_=site.ingredient_class)
    for node in nodes:
        row = db.RecipeIngredient(text=node.text, recipe_id=recipe.id)
        session.add(row)
        # Presumably flushed so row.id exists for the parts row.
        session.flush()

        row_parts = ingredient_to_parts(row)
        if row_parts:
            session.add(row_parts)

    logging.info(f"{len(nodes)} ingredients found. Inserting into DB")

    return recipe
|
||||||
|
|
||||||
|
|||||||
@@ -1,14 +1,57 @@
|
|||||||
from recipe_graph import scrape
|
from recipe_graph import scrape
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
|
||||||
|
|
||||||
import pytest
|
from pytest import fixture
|
||||||
|
|
||||||
|
|
||||||
|
@fixture
def mock_site():
    """Provide a RecipeSite whose class names match the mock page markup."""
    site_fields = {
        "name": "mock-site",
        "ingredient_class": "mock-ing",
        "name_class": "mock-name",
        "base_url": "example-site/mock-site",
    }
    return RecipeSite(**site_fields)
|
||||||
|
|
||||||
|
# TODO: should probably load HTML from file
|
||||||
|
@fixture
def mock_page():
    """Provide a parsed page with one recipe-name div and one ingredient div."""
    html = """
        <header></header><body>
        <div class="mock-name">test_recipe</div>
        <div class="mock-ing">test_ingredient</div>
        </body>
        """
    return BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
@fixture
def mock_blank_page():
    """Provide a parsed page that has neither a name div nor ingredient divs."""
    html = """ <header></header><body> </body> """
    return BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
|
|
||||||
|
@fixture
def mock_recipe():
    """Provide a Recipe named like the name div in the mock page."""
    recipe = Recipe(name="test_recipe", identifier="mock_1")
    return recipe
|
||||||
|
|
||||||
|
|
||||||
|
@fixture
def mock_ingredient():
    """Provide a RecipeIngredient carrying a quantity, a unit and a name."""
    ingredient = RecipeIngredient(text="1 ounce water")
    return ingredient
|
||||||
|
|
||||||
|
@fixture
def mock_url():
    """Provide the URL string used for the mock site."""
    url = "example-site/mock-site"
    return url
|
||||||
|
|
||||||
|
|
||||||
def test_load_page():
    """load_page gives a BeautifulSoup for a good URL and None for a bad one.

    NOTE: this test performs real HTTP requests, so it needs network access.
    """
    page = scrape.load_page("https://www.google.com")
    assert isinstance(page, BeautifulSoup)

    # A path that does not exist is expected to make load_page return None.
    page = scrape.load_page("https://www.google.com/some-nonsense")
    assert page is None
|
||||||
|
|
||||||
|
|
||||||
@@ -23,3 +66,50 @@ def test_ingredient_regex():
|
|||||||
regex.pattern
|
regex.pattern
|
||||||
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
|
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_parse_ingredient(mock_ingredient):
    """parse_ingredient splits text into five fields, or gives None for ""."""
    # Full line: quantity, unit, (empty) instruction, name, no supplement.
    parts = scrape.parse_ingredient(mock_ingredient.text)
    assert len(parts) > 0
    assert parts == ["1", "ounce", "", "water", None]

    # Bare name: every field except the ingredient name is None.
    parts = scrape.parse_ingredient("Water")
    assert len(parts) > 0
    assert parts == [None, None, None, "Water", None]

    # Empty text does not parse at all.
    parts = scrape.parse_ingredient("")
    assert parts is None
|
||||||
|
|
||||||
|
def test_parse_recipe_name(
    mock_site,
    mock_page,
    mock_recipe,
    mock_url,
    mock_blank_page,
):
    """parse_recipe_name sets the name, and raises when the page has none."""
    expected_name = mock_recipe.name
    mock_recipe.name = None

    mock_recipe = scrape.parse_recipe_name(
        mock_site,
        mock_page,
        mock_recipe,
    )
    assert mock_recipe.name == expected_name

    ex = None
    try:
        scrape.parse_recipe_name(
            mock_site,
            mock_blank_page,
            mock_recipe,
        )
    except Exception as e:
        # Python unbinds ``e`` when the except block ends, so keep a copy.
        ex = e

    assert ex is not None
    # With no url argument the message carries this site/recipe dict.
    url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
    assert str(ex) == f"Could not extract recipe name: {url}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingredient_to_parts(mock_ingredient):
    """ingredient_to_parts maps "1 ounce water" onto the parts fields."""
    parts = scrape.ingredient_to_parts(mock_ingredient)
    assert parts.quantity == "1"
    assert parts.unit == "ounce"
    assert parts.instruction == ""
    assert parts.ingredient == "water"
    assert parts.supplement is None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user