Compare commits

...

7 Commits

Author SHA1 Message Date
c4d5b3a7bf commented unused code
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 18:08:47 -04:00
9d0413ada5 refactor
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 17:09:57 -04:00
a04bb06ed8 added parse_ingredient test
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 01:04:52 -04:00
cf05777f2c increassed code coverage
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 23:52:20 -04:00
209597432d added ingrediet_to_parts tests
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 23:31:42 -04:00
e207c359ed test parse_recipe_name
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 19:05:34 -04:00
35fadd6638 changed test addresses to google
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 18:36:11 -04:00
2 changed files with 148 additions and 45 deletions

View File

@@ -9,6 +9,7 @@ from urllib.parse import urljoin
import logging
from argparse import ArgumentParser
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
ingredient_regex = "([a-zA-Z '\-]+)"
@@ -63,33 +64,27 @@ def parse_ingredient(
return [text.strip() if text else None for text in m.groups()]
def reparse_ingredients(session):
# this code is unused
# TODO: add tests when this is used
def missing_ingredients_query(session):
    """Return every RecipeIngredient whose id has no RecipeIngredientParts row.

    The id set is built as ``RecipeIngredient.id EXCEPT
    RecipeIngredientParts.id`` using ``except_``/``select`` (imported outside
    this view; presumably SQLAlchemy) and used as an ``IN`` filter.

    Args:
        session: the database session used to run the query.

    Returns:
        The result of ``.all()`` on the filtered RecipeIngredient query.
    """
    unparsed_ids = except_(
        select(db.RecipeIngredient.id),
        select(db.RecipeIngredientParts.id),
    ).alias("missing")
    query = session.query(db.RecipeIngredient).where(
        db.RecipeIngredient.id.in_(unparsed_ids)
    )
    return query.all()
# this code is unused
# TODO: add tests when this is used
def parse_missing_ingredients(session):
    """Parse and stage parts for every ingredient that has none yet.

    Each ingredient returned by ``missing_ingredients_query`` is converted
    with ``ingredient_to_parts`` and the result is added to ``session``.
    Nothing is flushed or committed here.

    Args:
        session: the database session to query and to add new parts to.
    """
    for ingredient in missing_ingredients_query(session):
        # Build the parts exactly once, through the shared helper, instead of
        # also repeating the inline parse/construct sequence.
        parts = ingredient_to_parts(ingredient)
        # ingredient_to_parts returns None when the text cannot be parsed;
        # skip those rather than handing None to session.add.
        if parts:
            session.add(parts)
def load_page(recipe_url):
def load_page(recipe_url: str) -> bs4.BeautifulSoup:
try:
logging.info(f"Loading Page: {recipe_url}")
with req.get(recipe_url) as resp:
@@ -102,43 +97,61 @@ def load_page(recipe_url):
logging.warning(e)
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: str = None,
) -> db.Recipe:
    """Set ``recipe.name`` from the first element matching the site's name class.

    Args:
        site: supplies ``name_class`` for the lookup and ``base_url`` for the
            fallback source description.
        page: parsed page searched with ``find_all(class_=site.name_class)``.
        recipe: object whose ``name`` is assigned; ``identifier`` is used in
            the fallback source description.
        url: where the page came from, used only in log and error messages.
            When falsy, a ``{"site": ..., "recipe": ...}`` dict is used.

    Returns:
        The same ``recipe`` object, with ``name`` set.

    Raises:
        Exception: if no element with the site's name class is found.
    """
    # Fall back to a site/recipe description so messages still identify the
    # source when no URL was given.
    url = url or {"site": site.base_url, "recipe": recipe.identifier}

    matches = page.find_all(class_=site.name_class)
    if not matches:
        raise Exception(f"Could not extract recipe name: {url}")

    recipe.name = matches[0].text
    logging.info(f"Adding Recipe {recipe.name} from {url}")
    return recipe
def ingredient_to_parts(ingredient: db.Ingredient) -> db.RecipeIngredientParts:
    """Build a RecipeIngredientParts row from an ingredient's text.

    Args:
        ingredient: object providing ``text`` (passed to ``parse_ingredient``)
            and ``id`` (reused as the id of the parts row).

    Returns:
        A new ``db.RecipeIngredientParts``, or ``None`` when
        ``parse_ingredient`` returns a falsy result.
    """
    parsed = parse_ingredient(ingredient.text)
    if not parsed:
        return None

    # parse_ingredient yields five fields in this fixed order.
    quantity, unit, instruction, name, supplement = parsed
    return db.RecipeIngredientParts(
        id=ingredient.id,
        quantity=quantity,
        unit=unit,
        instruction=instruction,
        ingredient=name,
        supplement=supplement,
    )
def parse_recipe(session, recipe, site):
    """Load a recipe page and stage the recipe and its ingredients in ``session``.

    The page URL is ``site.base_url`` joined with ``recipe.identifier``. The
    recipe name is extracted with ``parse_recipe_name``, then one
    ``db.RecipeIngredient`` is added per element matching
    ``site.ingredient_class``, along with its parsed parts when the text
    can be parsed.

    Args:
        session: database session that receives the new rows.
        recipe: recipe object to name and add; ``identifier`` locates the page.
        site: supplies ``base_url``, ``name_class`` and ``ingredient_class``.

    Returns:
        The recipe, or ``None`` when the page could not be loaded.

    Raises:
        Exception: propagated from ``parse_recipe_name`` when no name element
            is found on the page.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_page(recipe_url)
    if not recipe_page:
        return None

    # Name extraction lives in parse_recipe_name; do it once, there.
    recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url)
    session.add(recipe)
    # Flush before reading recipe.id below (presumably the id is assigned by
    # the database on flush -- confirm against the model).
    session.flush()

    candidates = recipe_page.find_all(class_=site.ingredient_class)
    for candidate in candidates:
        ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
        session.add(ingredient)
        # ingredient.id is reused as the id of the parts row, so flush first.
        session.flush()

        parts = ingredient_to_parts(ingredient)
        # None means the ingredient text could not be parsed; keep the raw
        # ingredient row and skip the parts.
        if parts:
            session.add(parts)

    logging.info(f"{len(candidates)} ingredients found. Inserting into DB")
    return recipe

View File

@@ -1,14 +1,57 @@
from recipe_graph import scrape
from bs4 import BeautifulSoup
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
import pytest
from pytest import fixture
@fixture
def mock_site():
    """Provide a RecipeSite whose class names match the markup in ``mock_page``."""
    site_fields = {
        "name": "mock-site",
        "ingredient_class": "mock-ing",
        "name_class": "mock-name",
        "base_url": "example-site/mock-site",
    }
    return RecipeSite(**site_fields)
# TODO: should probably load HTML from file
@fixture
def mock_page():
    """Provide a parsed page containing one recipe-name and one ingredient element."""
    markup = """
<header></header><body>
<div class="mock-name">test_recipe</div>
<div class="mock-ing">test_ingredient</div>
</body>
"""
    return BeautifulSoup(markup, "html.parser")
@fixture
def mock_blank_page():
    """Provide a parsed page with no recipe-name or ingredient elements."""
    markup = """ <header></header><body> </body> """
    return BeautifulSoup(markup, "html.parser")
@fixture
def mock_recipe():
    """Provide a Recipe named ``test_recipe`` with identifier ``mock_1``."""
    recipe = Recipe(name="test_recipe", identifier="mock_1")
    return recipe
@fixture
def mock_ingredient():
    """Provide a RecipeIngredient whose text is ``1 ounce water``."""
    ingredient = RecipeIngredient(text="1 ounce water")
    return ingredient
@fixture
def mock_url():
    """Provide the URL string used for the mock site."""
    url = "example-site/mock-site"
    return url
def test_load_page():
    """load_page returns a parsed page for a good URL and None for a bad one.

    NOTE: this test performs real HTTP requests against www.google.com.
    """
    # Each case loads exactly one URL; the superseded private-host requests
    # were overwritten before being checked and are dropped.
    page = scrape.load_page("https://www.google.com")
    assert isinstance(page, BeautifulSoup)

    page = scrape.load_page("https://www.google.com/some-nonsense")
    assert page is None
@@ -23,3 +66,50 @@ def test_ingredient_regex():
regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
)
def test_parse_ingredient(mock_ingredient):
    """parse_ingredient splits text into five parts, or returns None for no match."""
    # Quantity, unit and name present.
    parts = scrape.parse_ingredient(mock_ingredient.text)
    assert parts == ["1", "ounce", "", "water", None]

    # Name only: every other field is None.
    parts = scrape.parse_ingredient("Water")
    assert parts == [None, None, None, "Water", None]

    # Empty text does not parse; compare to the None singleton by identity.
    parts = scrape.parse_ingredient("")
    assert parts is None
def test_parse_recipe_name(
    mock_site,
    mock_page,
    mock_recipe,
    mock_url,
    mock_blank_page,
):
    """parse_recipe_name sets the name from the page and raises when none is found."""
    expected_name = mock_recipe.name
    mock_recipe.name = None

    mock_recipe = scrape.parse_recipe_name(
        mock_site,
        mock_page,
        mock_recipe,
    )
    assert mock_recipe.name == expected_name

    # With no url argument the error message carries the site/recipe dict.
    url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
    # pytest.raises fails the test if nothing is raised and keeps the
    # exception available after the block, unlike the `except ... as e` name.
    with pytest.raises(Exception) as excinfo:
        scrape.parse_recipe_name(
            mock_site,
            mock_blank_page,
            mock_recipe,
        )
    assert str(excinfo.value) == f"Could not extract recipe name: {url}"
def test_ingredient_to_parts(mock_ingredient):
    """ingredient_to_parts maps the parsed fields of "1 ounce water" onto the parts row."""
    parts = scrape.ingredient_to_parts(mock_ingredient)
    assert parts.quantity == "1"
    assert parts.unit == "ounce"
    assert parts.instruction == ""
    assert parts.ingredient == "water"
    # Compare to the None singleton by identity, not equality.
    assert parts.supplement is None