Skip to content

pages

onion_peeler.pages

GenericPage

Bases: WebPage

Site-agnostic page object powered by selectors from SiteConfig.

all_items()

Extract all configured item types from the current response.

Source code in src/onion_peeler/pages/base.py
@field
def all_items(self) -> Dict[str, List[Dict[str, Any]]]:
    """Collect extracted items for every item type listed in the site config.

    Returns:
        A mapping of item type name to the list produced by
        ``extract_items`` for that type. Item types whose extraction
        result is empty are left out of the mapping entirely.
    """
    # Lazily pair each configured type with its extraction result so that
    # extraction still happens once per type, in configuration order.
    extracted_by_type = (
        (type_name, self.extract_items(type_name))
        for type_name in self.site_config.selectors.keys()
    )
    return {type_name: rows for type_name, rows in extracted_by_type if rows}

extract_items(item_type)

Extract items for a specific configured item type.

Source code in src/onion_peeler/pages/base.py
def extract_items(self, item_type: str) -> List[Dict[str, Any]]:
    """Extract items for a specific configured item type.

    The selectors for ``item_type`` are read from the site config. An
    optional ``"container"`` selector picks the nodes to iterate over; when
    it is absent or empty, the page object itself is the single container.
    Every other entry is treated as ``field name -> selector`` and is
    evaluated against each container.

    Args:
        item_type: Name of the item type whose selectors should be applied.

    Returns:
        One dict per container that yielded at least one field. Each dict
        always carries ``"source"`` (the page URL) and ``"item_type"``, plus
        every field whose extracted value is not ``None``. String values are
        stripped of surrounding whitespace. Returns an empty list when no
        selectors are configured for ``item_type``.
    """
    configured = self.site_config.get_item_selectors(item_type=item_type)
    if not configured:
        return []

    # Work on a shallow copy. Popping "container" from the mapping handed
    # back by the config would remove it from the shared configuration if
    # the same dict object is returned on later calls, silently changing
    # the result of every subsequent extraction for this item type.
    selectors = dict(configured)
    container_selector = selectors.pop("container", None)
    containers = self._select(container_selector) if container_selector else [self]

    results: List[Dict[str, Any]] = []
    for container in containers:
        item: Dict[str, Any] = {"source": self.url, "item_type": item_type}
        for field_name, selector in selectors.items():
            value = self._extract_field_from_container(container, selector)
            if isinstance(value, str):
                value = value.strip()
            if value is not None:
                item[field_name] = value

        # Only the two bookkeeping keys present means nothing was extracted
        # from this container, so it is skipped.
        if len(item) > 2:
            results.append(item)

    return results

items()

Extract items for the default item type from request metadata or config.

Source code in src/onion_peeler/pages/base.py
@field
def items(self) -> List[Dict[str, Any]]:
    """Extract items for the page's default item type.

    Returns:
        The result of ``extract_items`` for the type reported by
        ``_default_item_type()``, or an empty list when that type is falsy.
    """
    default_type = self._default_item_type()
    return self.extract_items(default_type) if default_type else []

next_page_url()

Extract next page URL from site pagination selector.

Source code in src/onion_peeler/pages/base.py
@field
def next_page_url(self) -> Optional[str]:
    """Return the next-page link found via the configured pagination selector.

    Returns:
        The value of ``_extract_link`` applied to the pagination selector,
        or ``None`` when the site config has no ``pagination`` attribute,
        the attribute is falsy, or its ``selector`` is falsy.
    """
    # The config may not define pagination at all, hence the getattr default.
    pagination_cfg = getattr(self.site_config, "pagination", None)
    if not pagination_cfg or not pagination_cfg.selector:
        return None

    return self._extract_link(pagination_cfg.selector)

PageObjectProvider(injector)

Bases: PageObjectInputProvider

Inject page objects from config, independent of per-site Python classes.

Source code in src/onion_peeler/pages/factory.py
def __init__(self, injector) -> None:
    """Initialise the provider and keep the object returned by ``get_loader()``.

    Args:
        injector: Forwarded unchanged to the base-class initialiser.
    """
    # Base-class setup runs first, before any provider-specific state is added.
    super().__init__(injector)
    # NOTE(review): presumably the site-config loader used when building page
    # objects; confirm against the definition of get_loader().
    self._loader = get_loader()

resolve_page_object_class(import_path)

Resolve a configured page object class with a safe GenericPage fallback.

Source code in src/onion_peeler/pages/registry.py
def resolve_page_object_class(import_path: str | None) -> Type[GenericPage]:
    """Turn a dotted ``module.ClassName`` path into a page object class.

    Resolution never raises for a bad configuration value; every failure is
    logged as a warning and ``GenericPage`` is returned instead.

    Args:
        import_path: Dotted path whose last component is the class name and
            whose preceding part is the module to import. May be ``None``
            or empty.

    Returns:
        The class named by ``import_path`` when it imports cleanly and is a
        ``GenericPage`` subclass. ``GenericPage`` is returned when the path
        is missing, lacks a module or class part, fails to import or
        resolve, or names something that is not a ``GenericPage`` subclass.
    """
    if not import_path:
        return GenericPage

    # Split on the last dot: everything before it is the module path.
    module_name, _, class_name = import_path.rpartition(".")
    if not (module_name and class_name):
        logger.warning("Invalid page_object path '%s'; using GenericPage", import_path)
        return GenericPage

    try:
        candidate = getattr(importlib.import_module(module_name), class_name)
        if isinstance(candidate, type) and issubclass(candidate, GenericPage):
            return candidate
        logger.warning(
            "Configured page_object '%s' is not a GenericPage subclass; using GenericPage",
            import_path,
        )
    except Exception as exc:
        # Deliberately broad: importing arbitrary configured modules can fail
        # in any way, and a bad config entry must not abort the crawl.
        logger.warning("Failed to import page_object '%s': %s; using GenericPage", import_path, exc)

    # Shared fallback for both the "wrong type" and "import failed" paths.
    return GenericPage