commented unused code

refactor
added parse_ingredient test
2023-05-19 18:08:47 -04:00 · 2023-05-19 17:09:57 -04:00 · 2023-05-19 01:04:52 -04:00 · 2023-05-18 23:52:20 -04:00 · 2023-05-18 23:31:42 -04:00 · 2023-05-18 19:05:34 -04:00
2 changed files with 148 additions and 45 deletions
--- a/src/recipe_graph/scrape.py
+++ b/src/recipe_graph/scrape.py
@@ -9,6 +9,7 @@ from urllib.parse import urljoin
 import logging
 from argparse import ArgumentParser

+
 def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
    number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)"
    ingredient_regex = "([a-zA-Z '\-]+)"
@@ -63,33 +64,27 @@ def parse_ingredient(

    return [text.strip() if text else None for text in m.groups()]

-
-def reparse_ingredients(session):
+# this code is unused
+# TODO: add tests when this is used
+def missing_ingredients_query(session):
    cte = (
        except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
    ).alias("missing")
    missing = (
        session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
    )
+    return missing

+# this code is unused
+# TODO: add tests when this is used
+def parse_missing_ingredients(session):
+    missing = missing_ingredients_query(session)
    for ingredient in missing:
-        parts = parse_ingredient(ingredient.text)
-        if not parts:
-            continue
-        quantity, unit, instruction, name, supplement = parts
-        session.add(
-            db.RecipeIngredientParts(
-                id=ingredient.id,
-                quantity=quantity,
-                unit=unit,
-                instruction=instruction,
-                ingredient=name,
-                supplement=supplement,
-            )
-        )
+        parts = ingredient_to_parts(ingredient)
+        session.add(parts)


-def load_page(recipe_url):
+def load_page(recipe_url: str) -> bs4.BeautifulSoup:
    try:
        logging.info(f"Loading Page: {recipe_url}")
        with req.get(recipe_url) as resp:
@@ -102,43 +97,61 @@ def load_page(recipe_url):
        logging.warning(e)


+def parse_recipe_name(
+    site: db.RecipeSite,
+    page: bs4.BeautifulSoup,
+    recipe: db.Recipe,
+    url: str = None,
+) -> db.Recipe:
+    if not url:
+        url = {"site": site.base_url, "recipe": recipe.identifier}
+    name_candidates = page.find_all(class_=site.name_class)
+    if len(name_candidates) == 0:
+        raise Exception(f"Could not extract recipe name: {url}")
+    name_div = name_candidates[0]
+    recipe.name = name_div.text
+
+    logging.info(f"Adding Recipe {recipe.name} from {url}")
+
+    return recipe
+
+def ingredient_to_parts(
+    ingredient: db.Ingredient
+) -> db.RecipeIngredientParts:
+    parts = parse_ingredient(ingredient.text)
+    if parts:
+        quantity, unit, instruction, ingredient_name, supplement = parts
+        return db.RecipeIngredientParts(
+            id=ingredient.id,
+            quantity=quantity,
+            unit=unit,
+            instruction=instruction,
+            ingredient=ingredient_name,
+            supplement=supplement,
+        )
+
+
 def parse_recipe(session, recipe, site):
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_page(recipe_url)
    if not recipe_page:
        return None

-    name_candidates = recipe_page.find_all(class_=site.name_class)
-    if len(name_candidates) == 0:
-        raise Exception(f"Could not extract recipe name: {recipe_url}")
-    name_div = name_candidates[0]
-    recipe.name = name_div.text
-
-    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
-
+    recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url)
    session.add(recipe)
    session.flush()

-    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
-    for candidate in ingred_candidates:
-        ingred = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
-        session.add(ingred)
+    candidates = recipe_page.find_all(class_=site.ingredient_class)
+    for candidate in candidates:
+        ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id)
+        session.add(ingredient)
        session.flush()
        
-        parts = parse_ingredient(ingred.text)
+        parts = ingredient_to_parts(ingredient)
        if parts:
-            quantity, unit, instruction, ingredient, supplement = parts
-            ingred_parts = db.RecipeIngredientParts(
-                id=ingred.id,
-                quantity=quantity,
-                unit=unit,
-                instruction=instruction,
-                ingredient=ingredient,
-                supplement=supplement,
-            )
-            session.add(ingred_parts)
+            session.add(parts)

-    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
+    logging.info(f"{len(candidates)} ingredients found. Inserting into DB")

    return recipe

--- a/test/test_scrape.py
+++ b/test/test_scrape.py
@@ -1,14 +1,57 @@
 from recipe_graph import scrape
 from bs4 import BeautifulSoup
+from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts

-import pytest
+from pytest import fixture
+
+
+@fixture
+def mock_site():
+    return RecipeSite(
+        name="mock-site",
+        ingredient_class="mock-ing",
+        name_class="mock-name",
+        base_url="example-site/mock-site",
+    )
+
+# TODO: should probably load HTML from file
+@fixture
+def mock_page():
+    return BeautifulSoup(
+        """
+                    <header></header><body>
+                        <div class="mock-name">test_recipe</div>
+                        <div class="mock-ing">test_ingredient</div>
+                    </body>
+                  """,
+        "html.parser",
+    )
+
+
+@fixture
+def mock_blank_page():
+    return BeautifulSoup(""" <header></header><body> </body> """, "html.parser")
+
+
+@fixture
+def mock_recipe():
+    return Recipe(name="test_recipe", identifier="mock_1")
+
+
+@fixture
+def mock_ingredient():
+    return RecipeIngredient(text="1 ounce water")
+
+@fixture
+def mock_url():
+    return "example-site/mock-site"


 def test_load_page():
-    page = scrape.load_page("https://hs.andreistoica.ca:4943")
+    page = scrape.load_page("https://www.google.com")
    assert type(page) == BeautifulSoup

-    page = scrape.load_page("https://hs.andreistoica.ca:4943/some-nonesense")
+    page = scrape.load_page("https://www.google.com/some-nonsense")
    assert page == None


@@ -23,3 +66,50 @@ def test_ingredient_regex():
        regex.pattern
        == "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
    )
+
+def test_parse_ingredient(mock_ingredient):
+    parts = scrape.parse_ingredient(mock_ingredient.text)
+    assert len(parts) > 0
+    assert parts == ['1', 'ounce', '', 'water', None]
+
+    parts = scrape.parse_ingredient("Water")
+    assert len(parts) > 0
+    assert parts == [None, None, None, 'Water', None]
+
+    parts = scrape.parse_ingredient("")
+    assert parts == None
+
+def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url, mock_blank_page,):
+    expected_name = mock_recipe.name
+    mock_recipe.name = None
+
+    mock_recipe = scrape.parse_recipe_name(
+        mock_site,
+        mock_page,
+        mock_recipe,
+    )
+    assert mock_recipe.name == expected_name
+
+    ex = None
+    try:
+        mock_recipe = scrape.parse_recipe_name(
+            mock_site,
+            mock_blank_page,
+            mock_recipe,
+        )
+    except Exception as e:
+        ex = e
+        url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
+        assert str(e) == f"Could not extract recipe name: {url}"
+    assert ex
+
+
+
+def test_ingredient_to_parts(mock_ingredient):
+    parts = scrape.ingredient_to_parts(mock_ingredient)
+    assert parts.quantity == "1"
+    assert parts.unit == "ounce"
+    assert parts.instruction == ""
+    assert parts.ingredient == "water"
+    assert parts.supplement == None
+
Author	SHA1	Message	Date
Andrei Stoica	c4d5b3a7bf	commented unused code All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-19 18:08:47 -04:00
Andrei Stoica	9d0413ada5	refactor All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-19 17:09:57 -04:00
Andrei Stoica	a04bb06ed8	added parse_ingredient test All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-19 01:04:52 -04:00
Andrei Stoica	cf05777f2c	increased code coverage All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-18 23:52:20 -04:00
Andrei Stoica	209597432d	added ingredient_to_parts tests All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-18 23:31:42 -04:00
Andrei Stoica	e207c359ed	test parse_recipe_name All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-18 19:05:34 -04:00
Andrei Stoica	35fadd6638	changed test addresses to google All checks were successful continuous-integration/drone/push Build is passing Details	2023-05-18 18:36:11 -04:00