Репозитории (Repositories)

Слой доступа к данным.

`app.repositories.parsing.ParsingRepository`

Source code in app/repositories/parsing.py

class ParsingRepository:
    def __init__(self, session: AsyncSession, redis: Optional[Any] = None):
        self.session = session
        self.redis = redis

    async def get_due_sources(self, limit: int = 10) -> Sequence[ParsingSource]:
        """Select active sources whose ``next_sync_at`` has already passed.

        Rows are ordered by descending priority, then by the oldest
        ``next_sync_at``, and are selected ``FOR UPDATE SKIP LOCKED``.

        Args:
            limit: Maximum number of sources to return.

        Returns:
            The matching ``ParsingSource`` rows.
        """
        is_due = and_(
            ParsingSource.is_active.is_(True),
            ParsingSource.next_sync_at <= func.now(),
        )
        ordering = (ParsingSource.priority.desc(), ParsingSource.next_sync_at.asc())
        query = (
            select(ParsingSource)
            .where(is_due)
            .order_by(*ordering)
            .limit(limit)
            .with_for_update(skip_locked=True)
        )
        rows = await self.session.execute(query)
        return rows.scalars().all()

    async def update_source_stats(self, source_id: int, stats: dict):
        """Record a finished sync for a source and schedule the next one.

        When the source exists its status becomes ``"waiting"``, ``stats`` is
        stored under ``config["last_stats"]`` and ``next_sync_at`` is moved
        ``refresh_interval_hours`` into the future. The session is committed
        even when no source matched.

        Args:
            source_id: Primary key of the source to update.
            stats: Statistics of the finished run, stored as-is.
        """
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        source = (await self.session.execute(lookup)).scalar_one_or_none()

        if source:
            source.last_synced_at = func.now()
            # NOTE(review): this is naive application-local time, while
            # last_synced_at above and the due check in get_due_sources use
            # database time — confirm the two clocks/timezones agree.
            source.next_sync_at = datetime.now() + timedelta(hours=source.refresh_interval_hours)
            source.status = "waiting"
            # Assign a fresh dict instead of mutating the stored one in place.
            source.config = {**(source.config or {}), "last_stats": stats}

        await self.session.commit()

    async def set_queued(self, source_id: int):
        """Mark a source as queued and push its next sync 15 minutes ahead.

        The original note states the source is queued in RabbitMQ and that the
        worker sets the ``'running'`` status later; neither happens here.

        Args:
            source_id: Primary key of the source to mark.
        """
        retry_at = datetime.now() + timedelta(minutes=15)
        mark_queued = (
            update(ParsingSource)
            .where(ParsingSource.id == source_id)
            .values(status="queued", next_sync_at=retry_at)
        )
        await self.session.execute(mark_queued)
        await self.session.commit()

    async def get_or_create_category_maps(self, names: List[str]) -> List[CategoryMap]:
        if not names:
            return []

        # Bulk get existing
        stmt = select(CategoryMap).where(CategoryMap.external_name.in_(names))
        result = await self.session.execute(stmt)
        existing = {m.external_name: m for m in result.scalars().all()}

        new_names = [n for n in names if n not in existing]
        if new_names:
            # Bulk insert new ones
            if self.session.bind.dialect.name == "postgresql":
                from sqlalchemy.dialects.postgresql import insert as pg_insert
                insert_stmt = pg_insert(CategoryMap).values([
                    {"external_name": name, "internal_category_id": None}
                    for name in new_names
                ]).on_conflict_do_nothing()
                await self.session.execute(insert_stmt)
            else:
                # SQLite fallback: manual loop or simple insert (ignoring conflicts if we already filtered)
                for name in new_names:
                    self.session.add(CategoryMap(external_name=name, internal_category_id=None))
                await self.session.flush()

            # Fetch again to get all (including newly created)
            stmt = select(CategoryMap).where(CategoryMap.external_name.in_(names))
            result = await self.session.execute(stmt)
            return list(result.scalars().all())

        return list(existing.values())

    async def get_unmapped_categories(self, limit: int = 100) -> Sequence[CategoryMap]:
        """Return categories not yet linked to an internal Gifty category.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            ``CategoryMap`` rows whose ``internal_category_id`` is NULL.
        """
        unmapped = CategoryMap.internal_category_id.is_(None)
        query = select(CategoryMap).where(unmapped).limit(limit)
        rows = await self.session.execute(query)
        return rows.scalars().all()

    async def update_category_mappings(self, mappings: List[dict]) -> int:
        """
        Массово обновляет привязки внешних категорий к внутренним.
        mappings: [{"external_name": "...", "internal_category_id": 123}, ...]
        """
        if not mappings:
            return 0

        count = 0
        for m in mappings:
            stmt = (
                update(CategoryMap)
                .where(CategoryMap.external_name == m["external_name"])
                .values(internal_category_id=m["internal_category_id"])
            )
            await self.session.execute(stmt)
            count += 1

        await self.session.commit()
        return count

    async def get_all_sources(self) -> Sequence[ParsingSource]:
        """Return every parsing source ordered by ascending ``id``."""
        by_id = select(ParsingSource).order_by(ParsingSource.id.asc())
        rows = await self.session.execute(by_id)
        return rows.scalars().all()

    async def upsert_source(self, data: dict) -> ParsingSource:
        url = data.get("url")
        if not url:
            raise ValueError("URL is required")

        stmt = select(ParsingSource).where(ParsingSource.url == url)
        result = await self.session.execute(stmt)
        source = result.scalar_one_or_none()

        if source:
            for key, value in data.items():
                if hasattr(source, key):
                    setattr(source, key, value)
        else:
            source = ParsingSource(**data)
            self.session.add(source)

        await self.session.commit()
        await self.session.refresh(source)
        return source

    async def get_source_by_id(self, source_id: int) -> Optional[ParsingSource]:
        """Return the source with the given primary key, or ``None``."""
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        return (await self.session.execute(lookup)).scalar_one_or_none()

    async def get_source(self, source_id: int) -> Optional[ParsingSource]:
        """Alias for get_source_by_id used by IngestionService."""
        return await self.get_source_by_id(source_id)

    async def report_source_error(self, source_id: int, error_msg: str, is_broken: bool = True) -> Optional[ParsingSource]:
        """Store an error on a source and either retry later or mark it broken.

        The message is saved under ``config["last_error"]``. When ``is_broken``
        is true, or three retries were already recorded, the source is
        deactivated with status ``"broken"`` and ``fix_required`` set.
        Otherwise the retry counter grows, the status becomes ``"error"`` and
        the next sync is delayed by 10 minutes per attempt.

        Args:
            source_id: Primary key of the failing source.
            error_msg: Error text to store.
            is_broken: Skip the retries and mark the source broken right away.

        Returns:
            The updated source, or ``None`` when it does not exist.
        """
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        source = (await self.session.execute(lookup)).scalar_one_or_none()
        if not source:
            return None

        cfg = dict(source.config or {})
        cfg["last_error"] = error_msg
        retries = cfg.get("retry_count", 0)

        give_up = is_broken or retries >= 3
        if give_up:
            source.status = "broken"
            source.is_active = False
            cfg["fix_required"] = True
            cfg["retry_count"] = 0  # start from zero after a manual fix
        else:
            attempt = retries + 1
            cfg["retry_count"] = attempt
            source.status = "error"
            # Linear back-off: 10 minutes multiplied by the attempt number.
            source.next_sync_at = datetime.now() + timedelta(minutes=10 * attempt)

        source.config = cfg
        await self.session.commit()
        return source

    async def sync_spiders(self, available_spiders: List[str]) -> List[str]:
        """Register spiders that have no ``ParsingSource`` row yet.

        Each unknown key gets an inactive placeholder source (type ``"hub"``,
        strategy ``"discovery"``) that still has to be configured. The session
        is committed only when something was added.

        Args:
            available_spiders: Spider keys reported by the scraper; repeated
                keys are handled once.

        Returns:
            The keys that were not in the database, in input order and
            without duplicates.
        """
        stmt = select(ParsingSource.site_key)
        result = await self.session.execute(stmt)
        known_keys = set(result.scalars().all())

        new_spiders = []
        for spider_key in available_spiders:
            if spider_key in known_keys:
                continue
            # Remember the key so a repeated entry in the input does not create
            # a second placeholder row.
            known_keys.add(spider_key)
            self.session.add(
                ParsingSource(
                    site_key=spider_key,
                    url=f"https://{spider_key}.placeholder",  # must be filled in manually
                    type="hub",
                    strategy="discovery",
                    is_active=False,
                    config={"is_new": True, "note": "Automatically detected, please configure"},
                )
            )
            new_spiders.append(spider_key)

        if new_spiders:
            await self.session.commit()

        return new_spiders

    async def set_source_active_status(self, source_id: int, is_active: bool) -> bool:
        """Enable or disable a source and set its status to match.

        Args:
            source_id: Primary key of the source.
            is_active: ``True`` sets status ``"waiting"``, ``False`` sets
                ``"disabled"``.

        Returns:
            ``True`` when at least one row was updated.
        """
        new_status = "waiting" if is_active else "disabled"
        toggle = (
            update(ParsingSource)
            .where(ParsingSource.id == source_id)
            .values(is_active=is_active, status=new_status)
        )
        outcome = await self.session.execute(toggle)
        await self.session.commit()
        return outcome.rowcount > 0

    async def set_source_status(self, source_id: int, status: str):
        """Set the ``status`` column of one source and commit."""
        change = update(ParsingSource).where(ParsingSource.id == source_id)
        await self.session.execute(change.values(status=status))
        await self.session.commit()

    async def update_source_logs(self, source_id: int, logs: str):
        """Store ``logs`` under ``config["last_logs"]`` of the given source.

        Does nothing (and does not commit) when the source does not exist.
        """
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        source = (await self.session.execute(lookup)).scalar_one_or_none()
        if not source:
            return

        # Assign a fresh dict instead of mutating the stored one in place.
        source.config = {**(source.config or {}), "last_logs": logs}
        await self.session.commit()

    async def reset_source_error(self, source_id: int):
        """Remove ``last_error`` and ``fix_required`` from a source's config.

        Does nothing (and does not commit) when the source does not exist.
        """
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        source = (await self.session.execute(lookup)).scalar_one_or_none()
        if not source:
            return

        # A new dict is assigned rather than editing the stored one in place;
        # presumably so the ORM notices the change — confirm against the
        # column's mutation tracking.
        cleared_keys = ("last_error", "fix_required")
        source.config = {
            key: value
            for key, value in (source.config or {}).items()
            if key not in cleared_keys
        }
        await self.session.commit()

    async def update_source(self, source_id: int, data: dict) -> Optional[ParsingSource]:
        """Apply ``data`` to an existing source.

        Args:
            source_id: Primary key of the source.
            data: Attribute values to set; keys that do not name an existing
                attribute are ignored.

        Returns:
            The refreshed source, or ``None`` when it does not exist.
        """
        lookup = select(ParsingSource).where(ParsingSource.id == source_id)
        source = (await self.session.execute(lookup)).scalar_one_or_none()
        if not source:
            return None

        for field, value in data.items():
            if hasattr(source, field):
                setattr(source, field, value)
        await self.session.commit()
        await self.session.refresh(source)
        return source

    async def log_parsing_run(self, source_id: int, status: str, items_scraped: int, items_new: int, error_message: str = None):
        """Persist one ``ParsingRun`` record and commit.

        Args:
            source_id: Source the run belongs to.
            status: Outcome label of the run.
            items_scraped: Number of items scraped.
            items_new: Number of new items.
            error_message: Optional error text.
        """
        from app.models import ParsingRun

        fields = {
            "source_id": source_id,
            "status": status,
            "items_scraped": items_scraped,
            "items_new": items_new,
            "error_message": error_message,
        }
        self.session.add(ParsingRun(**fields))
        await self.session.commit()

    async def get_source_history(self, source_id: int, limit: int = 15):
        """Return the most recent runs of a source, newest first.

        Args:
            source_id: Source whose runs are listed.
            limit: Maximum number of runs to return.
        """
        from app.models import ParsingRun

        newest_first = ParsingRun.created_at.desc()
        query = (
            select(ParsingRun)
            .where(ParsingRun.source_id == source_id)
            .order_by(newest_first)
            .limit(limit)
        )
        rows = await self.session.execute(query)
        return rows.scalars().all()

    async def get_total_products_count(self, site_key: str) -> int:
        """Count products whose ``gift_id`` starts with ``"<site_key>:"``.

        The count is approximate: it relies only on the ``gift_id`` prefix.

        Args:
            site_key: Site identifier used as the ``gift_id`` prefix.

        Returns:
            Number of matching products (``0`` when there are none).
        """
        from app.models import Product

        # autoescape keeps "%" and "_" inside site_key literal, so a key such as
        # "my_shop" no longer also matches "myXshop:...".
        belongs_to_site = Product.gift_id.startswith(f"{site_key}:", autoescape=True)
        stmt = select(func.count()).select_from(Product).where(belongs_to_site)
        result = await self.session.execute(stmt)
        return result.scalar() or 0

    async def get_aggregate_status(self, site_key: str) -> str:
        """Return the most significant status among a site's sources.

        Precedence is ``running`` > ``queued`` > ``error`` > ``broken``;
        ``"waiting"`` is returned when none of them is present.
        """
        query = select(ParsingSource.status).where(ParsingSource.site_key == site_key)
        rows = await self.session.execute(query)
        present = set(rows.scalars().all())
        for candidate in ("running", "queued", "error", "broken"):
            if candidate in present:
                return candidate
        return "waiting"

    async def get_sites_monitoring(self) -> List[dict]:
        """Return one health summary per ``site_key`` using a single GROUP BY.

        Each summary holds the site key, its number of sources, an aggregate
        status (``running`` > ``queued`` > ``error`` > ``broken``, else
        ``waiting``) and the latest ``last_synced_at`` as an ISO string.
        ``is_active`` is always ``True`` in this summary.
        """
        # (status value, result label) pairs, listed in precedence order.
        tracked = (
            ("running", "running_count"),
            ("queued", "queued_count"),
            ("error", "error_count"),
            ("broken", "broken_count"),
        )
        per_status = [
            func.sum(case((ParsingSource.status == value, 1), else_=0)).label(label)
            for value, label in tracked
        ]
        query = select(
            ParsingSource.site_key,
            func.count(ParsingSource.id).label("total_sources"),
            *per_status,
            func.max(ParsingSource.last_synced_at).label("last_synced_at"),
        ).group_by(ParsingSource.site_key)

        rows = (await self.session.execute(query)).all()

        summaries = []
        for row in rows:
            status = next(
                (value for value, label in tracked if getattr(row, label) > 0),
                "waiting",
            )
            synced = row.last_synced_at
            summaries.append({
                "site_key": row.site_key,
                "total_sources": row.total_sources,
                "status": status,
                "last_synced_at": synced.isoformat() if synced else None,
                "is_active": True,  # simplified for the summary
            })
        return summaries

    async def get_24h_stats(self) -> dict:
        """Sum scraped and new item counts of runs created in the last 24 hours.

        Returns:
            ``{"scraped_24h": int, "new_24h": int}``; missing sums count as 0.
        """
        cutoff = datetime.now() - timedelta(hours=24)
        totals = select(
            func.sum(ParsingRun.items_scraped).label("scraped"),
            func.sum(ParsingRun.items_new).label("new"),
        ).where(ParsingRun.created_at >= cutoff)

        row = (await self.session.execute(totals)).one_or_none()
        if not row:
            return {"scraped_24h": 0, "new_24h": 0}
        return {
            "scraped_24h": int(row.scraped or 0),
            "new_24h": int(row.new or 0),
        }

    async def get_aggregate_history(self, site_key: str, limit_days: int = 15):
        """Return per-day totals of runs across every source of a site.

        Args:
            site_key: Site whose runs are aggregated.
            limit_days: Maximum number of day buckets, newest first.

        Returns:
            Rows of ``(day, items_new, items_scraped)``.
        """
        from app.models import ParsingRun, ParsingSource

        # Runs are bucketed by created_at truncated to the day.
        day_bucket = func.date_trunc('day', ParsingRun.created_at)
        columns = (
            day_bucket.label("day"),
            func.sum(ParsingRun.items_new).label("items_new"),
            func.sum(ParsingRun.items_scraped).label("items_scraped"),
        )
        query = (
            select(*columns)
            .join(ParsingSource, ParsingRun.source_id == ParsingSource.id)
            .where(ParsingSource.site_key == site_key)
            .group_by(day_bucket)
            .order_by(day_bucket.desc())
            .limit(limit_days)
        )
        return (await self.session.execute(query)).all()

    async def get_last_full_cycle_stats(self, site_key: str) -> int:
        """Sum the new items recorded since the site's latest hub run.

        Args:
            site_key: Site to inspect.

        Returns:
            Total ``items_new`` of all the site's runs created at or after its
            most recent run on a ``"hub"`` source; ``0`` when no such run
            exists.
        """
        from app.models import ParsingRun, ParsingSource

        same_source = ParsingRun.source_id == ParsingSource.id
        for_site = ParsingSource.site_key == site_key

        # Latest run on a hub source. NOTE(review): the run status is not
        # filtered, so this is not limited to successful runs.
        latest_hub_query = (
            select(ParsingRun.created_at)
            .join(ParsingSource, same_source)
            .where(for_site)
            .where(ParsingSource.type == "hub")
            .order_by(ParsingRun.created_at.desc())
            .limit(1)
        )
        cycle_start = (await self.session.execute(latest_hub_query)).scalar()
        if not cycle_start:
            return 0

        # Everything the site produced from that moment onwards.
        new_items_query = (
            select(func.sum(ParsingRun.items_new))
            .join(ParsingSource, same_source)
            .where(for_site)
            .where(ParsingRun.created_at >= cycle_start)
        )
        return (await self.session.execute(new_items_query)).scalar() or 0

    async def get_total_category_products_count(self, site_key: str, category_name: str) -> int:
        """Count a site's products that are stored under a given category.

        Args:
            site_key: Site identifier used as the ``gift_id`` prefix.
            category_name: Value compared with ``Product.category``.

        Returns:
            Number of matching products (``0`` when there are none).
        """
        from app.models import Product

        # autoescape keeps "%" and "_" inside site_key literal instead of
        # letting them act as LIKE wildcards.
        belongs_to_site = Product.gift_id.startswith(f"{site_key}:", autoescape=True)
        stmt = select(func.count()).select_from(Product).where(
            and_(
                belongs_to_site,
                Product.category == category_name,
            )
        )
        result = await self.session.execute(stmt)
        return result.scalar() or 0

    async def get_source_daily_history(self, source_id: int, limit_days: int = 15):
        """Return per-day totals of one source's runs, newest day first.

        Args:
            source_id: Source whose runs are aggregated.
            limit_days: Maximum number of day buckets.

        Returns:
            Rows of ``(day, items_new, items_scraped)``.
        """
        from app.models import ParsingRun

        day_bucket = func.date_trunc('day', ParsingRun.created_at)
        columns = (
            day_bucket.label("day"),
            func.sum(ParsingRun.items_new).label("items_new"),
            func.sum(ParsingRun.items_scraped).label("items_scraped"),
        )
        query = (
            select(*columns)
            .where(ParsingRun.source_id == source_id)
            .group_by(day_bucket)
            .order_by(day_bucket.desc())
            .limit(limit_days)
        )
        return (await self.session.execute(query)).all()

    async def get_active_workers(self) -> List[dict]:
        """Fetches active workers from Redis heartbeats."""
        if not self.redis:
            return []

        workers = []
        try:
            keys = await self.redis.keys("worker_heartbeat:*")
            for key in keys:
                data = await self.redis.get(key)
                if data:
                    workers.append(json.loads(data))
        except Exception as e:
            logger.error(f"Error fetching workers from Redis: {e}")
        return workers

    async def get_source_by_url(self, url: str) -> Optional[ParsingSource]:
        """Return the source whose ``url`` equals ``url``, or ``None``."""
        lookup = select(ParsingSource).where(ParsingSource.url == url)
        return (await self.session.execute(lookup)).scalar_one_or_none()

    async def count_discovered_today(self) -> int:
        """Count sources created in the last 24 hours that were already activated.

        "Activated" here means the status is anything other than
        ``"discovered"``; the window is a rolling day, not the calendar day.
        """
        day_ago = func.now() - timedelta(days=1)
        criteria = and_(
            ParsingSource.status != "discovered",
            ParsingSource.created_at >= day_ago,
        )
        query = select(func.count(ParsingSource.id)).where(criteria)
        return (await self.session.execute(query)).scalar() or 0

    async def get_discovered_sources(self, limit: int = 50) -> List[ParsingSource]:
        """Return backlog sources that still have status ``"discovered"``.

        Highest priority comes first; ties are broken by the oldest
        ``created_at``.

        Args:
            limit: Maximum number of sources to return.
        """
        in_backlog = ParsingSource.status == "discovered"
        ordering = (ParsingSource.priority.desc(), ParsingSource.created_at.asc())
        query = select(ParsingSource).where(in_backlog).order_by(*ordering).limit(limit)
        rows = await self.session.execute(query)
        return rows.scalars().all()

    async def activate_sources(self, source_ids: List[int]):
        """Activates sources from the backlog."""
        if not source_ids:
            return
        stmt = (
            update(ParsingSource)
            .where(ParsingSource.id.in_(source_ids))
            .values(
                is_active=True,
                status="waiting",
                next_sync_at=func.now()
            )
        )
        await self.session.execute(stmt)
        await self.session.commit()

Functions

`activate_sources(source_ids)` `async`

Activates sources from the backlog.

Source code in app/repositories/parsing.py

async def activate_sources(self, source_ids: List[int]):
    """Activate backlog sources and make them due immediately.

    The listed sources get ``is_active=True``, status ``"waiting"`` and
    ``next_sync_at`` set to the database's current time. Empty input is a
    no-op.

    Args:
        source_ids: Primary keys of the sources to activate.
    """
    if not source_ids:
        return

    targets = ParsingSource.id.in_(source_ids)
    activation = update(ParsingSource).where(targets).values(
        is_active=True,
        status="waiting",
        next_sync_at=func.now(),
    )
    await self.session.execute(activation)
    await self.session.commit()

`count_discovered_today()` `async`

Counts sources created within the last 24 hours that are no longer in the 'discovered' state (i.e. already activated).

Source code in app/repositories/parsing.py

async def count_discovered_today(self) -> int:
    """Count sources created in the last 24 hours that were already activated.

    "Activated" here means the status is anything other than
    ``"discovered"``; the window is a rolling day, not the calendar day.
    """
    day_ago = func.now() - timedelta(days=1)
    criteria = and_(
        ParsingSource.status != "discovered",
        ParsingSource.created_at >= day_ago,
    )
    query = select(func.count(ParsingSource.id)).where(criteria)
    return (await self.session.execute(query)).scalar() or 0

`get_24h_stats()` `async`

Returns scraped items count for the last 24h from DB ParsingRuns.

Source code in app/repositories/parsing.py

async def get_24h_stats(self) -> dict:
    """Sum scraped and new item counts of runs created in the last 24 hours.

    Returns:
        ``{"scraped_24h": int, "new_24h": int}``; missing sums count as 0.
    """
    cutoff = datetime.now() - timedelta(hours=24)
    totals = select(
        func.sum(ParsingRun.items_scraped).label("scraped"),
        func.sum(ParsingRun.items_new).label("new"),
    ).where(ParsingRun.created_at >= cutoff)

    row = (await self.session.execute(totals)).one_or_none()
    if not row:
        return {"scraped_24h": 0, "new_24h": 0}
    return {
        "scraped_24h": int(row.scraped or 0),
        "new_24h": int(row.new or 0),
    }

`get_active_workers()` `async`

Fetches active workers from Redis heartbeats.

Source code in app/repositories/parsing.py

async def get_active_workers(self) -> List[dict]:
    """Collect worker heartbeat payloads stored in Redis.

    Every key matching ``worker_heartbeat:*`` is read and its non-empty
    value decoded as JSON. Any error is logged, and whatever was collected
    before it is still returned.

    Returns:
        Decoded heartbeat payloads; empty when no Redis client is set.
    """
    client = self.redis
    if not client:
        return []

    heartbeats = []
    try:
        for heartbeat_key in await client.keys("worker_heartbeat:*"):
            payload = await client.get(heartbeat_key)
            if not payload:
                continue
            heartbeats.append(json.loads(payload))
    except Exception as e:
        logger.error(f"Error fetching workers from Redis: {e}")
    return heartbeats

`get_aggregate_history(site_key, limit_days=15)` `async`

Returns runs aggregated by day for the entire site.

Source code in app/repositories/parsing.py

async def get_aggregate_history(self, site_key: str, limit_days: int = 15):
    """Return per-day totals of runs across every source of a site.

    Args:
        site_key: Site whose runs are aggregated.
        limit_days: Maximum number of day buckets, newest first.

    Returns:
        Rows of ``(day, items_new, items_scraped)``.
    """
    from app.models import ParsingRun, ParsingSource

    # Runs are bucketed by created_at truncated to the day.
    day_bucket = func.date_trunc('day', ParsingRun.created_at)
    columns = (
        day_bucket.label("day"),
        func.sum(ParsingRun.items_new).label("items_new"),
        func.sum(ParsingRun.items_scraped).label("items_scraped"),
    )
    query = (
        select(*columns)
        .join(ParsingSource, ParsingRun.source_id == ParsingSource.id)
        .where(ParsingSource.site_key == site_key)
        .group_by(day_bucket)
        .order_by(day_bucket.desc())
        .limit(limit_days)
    )
    return (await self.session.execute(query)).all()

`get_aggregate_status(site_key)` `async`

Returns the most significant status among the site's sources, checked in the order 'running', 'queued', 'error', 'broken'; returns 'waiting' when none of them is present.

Source code in app/repositories/parsing.py

async def get_aggregate_status(self, site_key: str) -> str:
    """Return the most significant status among a site's sources.

    Precedence is ``running`` > ``queued`` > ``error`` > ``broken``;
    ``"waiting"`` is returned when none of them is present.
    """
    query = select(ParsingSource.status).where(ParsingSource.site_key == site_key)
    rows = await self.session.execute(query)
    present = set(rows.scalars().all())
    for candidate in ("running", "queued", "error", "broken"):
        if candidate in present:
            return candidate
    return "waiting"

`get_discovered_sources(limit=50)` `async`

Fetches inactive 'discovered' sources from the backlog.

Source code in app/repositories/parsing.py

async def get_discovered_sources(self, limit: int = 50) -> List[ParsingSource]:
    """Return backlog sources that still have status ``"discovered"``.

    Highest priority comes first; ties are broken by the oldest
    ``created_at``.

    Args:
        limit: Maximum number of sources to return.
    """
    in_backlog = ParsingSource.status == "discovered"
    ordering = (ParsingSource.priority.desc(), ParsingSource.created_at.asc())
    query = select(ParsingSource).where(in_backlog).order_by(*ordering).limit(limit)
    rows = await self.session.execute(query)
    return rows.scalars().all()

`get_last_full_cycle_stats(site_key)` `async`

Calculates how many new items were added since the last 'discovery' run started.

Source code in app/repositories/parsing.py

async def get_last_full_cycle_stats(self, site_key: str) -> int:
    """Sum the new items recorded since the site's latest hub run.

    Args:
        site_key: Site to inspect.

    Returns:
        Total ``items_new`` of all the site's runs created at or after its
        most recent run on a ``"hub"`` source; ``0`` when no such run
        exists.
    """
    from app.models import ParsingRun, ParsingSource

    same_source = ParsingRun.source_id == ParsingSource.id
    for_site = ParsingSource.site_key == site_key

    # Latest run on a hub source. NOTE(review): the run status is not
    # filtered, so this is not limited to successful runs.
    latest_hub_query = (
        select(ParsingRun.created_at)
        .join(ParsingSource, same_source)
        .where(for_site)
        .where(ParsingSource.type == "hub")
        .order_by(ParsingRun.created_at.desc())
        .limit(1)
    )
    cycle_start = (await self.session.execute(latest_hub_query)).scalar()
    if not cycle_start:
        return 0

    # Everything the site produced from that moment onwards.
    new_items_query = (
        select(func.sum(ParsingRun.items_new))
        .join(ParsingSource, same_source)
        .where(for_site)
        .where(ParsingRun.created_at >= cycle_start)
    )
    return (await self.session.execute(new_items_query)).scalar() or 0

`get_sites_monitoring()` `async`

Returns aggregate health and stats for all sites efficiently via SQL.

Source code in app/repositories/parsing.py

async def get_sites_monitoring(self) -> List[dict]:
    """Return one health summary per ``site_key`` using a single GROUP BY.

    Each summary holds the site key, its number of sources, an aggregate
    status (``running`` > ``queued`` > ``error`` > ``broken``, else
    ``waiting``) and the latest ``last_synced_at`` as an ISO string.
    ``is_active`` is always ``True`` in this summary.
    """
    # (status value, result label) pairs, listed in precedence order.
    tracked = (
        ("running", "running_count"),
        ("queued", "queued_count"),
        ("error", "error_count"),
        ("broken", "broken_count"),
    )
    per_status = [
        func.sum(case((ParsingSource.status == value, 1), else_=0)).label(label)
        for value, label in tracked
    ]
    query = select(
        ParsingSource.site_key,
        func.count(ParsingSource.id).label("total_sources"),
        *per_status,
        func.max(ParsingSource.last_synced_at).label("last_synced_at"),
    ).group_by(ParsingSource.site_key)

    rows = (await self.session.execute(query)).all()

    summaries = []
    for row in rows:
        status = next(
            (value for value, label in tracked if getattr(row, label) > 0),
            "waiting",
        )
        synced = row.last_synced_at
        summaries.append({
            "site_key": row.site_key,
            "total_sources": row.total_sources,
            "status": status,
            "last_synced_at": synced.isoformat() if synced else None,
            "is_active": True,  # simplified for the summary
        })
    return summaries

`get_source(source_id)` `async`

Alias for get_source_by_id used by IngestionService.

Source code in app/repositories/parsing.py

async def get_source(self, source_id: int) -> Optional[ParsingSource]:
    """Delegate to ``get_source_by_id``; kept as an alias for IngestionService."""
    source = await self.get_source_by_id(source_id)
    return source

`get_unmapped_categories(limit=100)` `async`

Возвращает категории, у которых еще нет привязки к внутренней категории Gifty.

Source code in app/repositories/parsing.py

async def get_unmapped_categories(self, limit: int = 100) -> Sequence[CategoryMap]:
    """Return categories not yet linked to an internal Gifty category.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        ``CategoryMap`` rows whose ``internal_category_id`` is NULL.
    """
    unmapped = CategoryMap.internal_category_id.is_(None)
    query = select(CategoryMap).where(unmapped).limit(limit)
    rows = await self.session.execute(query)
    return rows.scalars().all()

`set_queued(source_id)` `async`

Marks source as queued in RabbitMQ. Status 'running' will be set by worker.

Source code in app/repositories/parsing.py

async def set_queued(self, source_id: int):
    """Mark a source as queued and push its next sync 15 minutes ahead.

    The original note states the source is queued in RabbitMQ and that the
    worker sets the ``'running'`` status later; neither happens here.

    Args:
        source_id: Primary key of the source to mark.
    """
    retry_at = datetime.now() + timedelta(minutes=15)
    mark_queued = (
        update(ParsingSource)
        .where(ParsingSource.id == source_id)
        .values(status="queued", next_sync_at=retry_at)
    )
    await self.session.execute(mark_queued)
    await self.session.commit()

`sync_spiders(available_spiders)` `async`

Synchronizes the list of spiders from the scraper with the database. Returns a list of NEW spider keys that were not in the database.

Source code in app/repositories/parsing.py

async def sync_spiders(self, available_spiders: List[str]) -> List[str]:
    """Register spiders that have no ``ParsingSource`` row yet.

    Each unknown key gets an inactive placeholder source (type ``"hub"``,
    strategy ``"discovery"``) that still has to be configured. The session
    is committed only when something was added.

    Args:
        available_spiders: Spider keys reported by the scraper; repeated
            keys are handled once.

    Returns:
        The keys that were not in the database, in input order and
        without duplicates.
    """
    stmt = select(ParsingSource.site_key)
    result = await self.session.execute(stmt)
    known_keys = set(result.scalars().all())

    new_spiders = []
    for spider_key in available_spiders:
        if spider_key in known_keys:
            continue
        # Remember the key so a repeated entry in the input does not create
        # a second placeholder row.
        known_keys.add(spider_key)
        self.session.add(
            ParsingSource(
                site_key=spider_key,
                url=f"https://{spider_key}.placeholder",  # must be filled in manually
                type="hub",
                strategy="discovery",
                is_active=False,
                config={"is_new": True, "note": "Automatically detected, please configure"},
            )
        )
        new_spiders.append(spider_key)

    if new_spiders:
        await self.session.commit()

    return new_spiders

`update_category_mappings(mappings)` `async`

Массово обновляет привязки внешних категорий к внутренним. mappings: [{"external_name": "...", "internal_category_id": 123}, ...]

Source code in app/repositories/parsing.py

async def update_category_mappings(self, mappings: List[dict]) -> int:
    """Bulk-update links from external categories to internal ones.

    Args:
        mappings: Items shaped like
            ``{"external_name": "...", "internal_category_id": 123}``.

    Returns:
        Number of mappings processed. One UPDATE is issued per item and
        counted whether or not it matched a row; ``0`` for empty input.
    """
    if not mappings:
        return 0

    processed = 0
    for processed, item in enumerate(mappings, start=1):
        await self.session.execute(
            update(CategoryMap)
            .where(CategoryMap.external_name == item["external_name"])
            .values(internal_category_id=item["internal_category_id"])
        )

    await self.session.commit()
    return processed

`app.repositories.catalog.PostgresCatalogRepository`

Bases: CatalogRepository

Source code in app/repositories/catalog.py

class PostgresCatalogRepository(CatalogRepository):
    """
    Catalog repository backed by a SQLAlchemy ``AsyncSession``.

    None of the methods here commit; transaction control is left to the
    owner of the session.
    """

    def __init__(self, session: AsyncSession):
        self.session = session

    async def upsert_products(self, products: list[dict]) -> int:
        """
        Insert products, updating existing rows on a ``gift_id`` conflict.

        On conflict every column except ``created_at`` and ``gift_id`` is
        overwritten with the incoming value.

        Returns:
            On PostgreSQL, the number of rows that were newly inserted
            (updates are not counted). On other dialects, the statement's
            rowcount. Returns 0 for an empty input.
        """
        if not products:
            return 0

        # Construct values for upsert.
        # We assume products list contains dicts matching Product model fields.
        stmt = insert(Product).values(products)

        # On conflict do update
        # We update everything except created_at (and gift_id obviously)
        update_dict = {
            col.name: col
            for col in stmt.excluded
            if col.name not in ("created_at", "gift_id")
        }

        stmt = stmt.on_conflict_do_update(
            index_elements=[Product.gift_id],
            set_=update_dict
        )

        # RETURNING xmax is specific to PostgreSQL for counting inserts vs updates
        if self.session.bind.dialect.name == "postgresql":
            stmt = stmt.returning(sa.literal_column("xmax"))
            result = await self.session.execute(stmt)
            rows = result.scalars().all()
            # xmax == 0 is treated as "row was freshly inserted".
            inserted_count = sum(1 for xmax in rows if xmax == 0)
            return inserted_count
        else:
            # Fallback for SQLite/others: just return rowcount
            result = await self.session.execute(stmt)
            return result.rowcount

    async def mark_inactive_except(self, seen_ids: set[str]) -> int:
        """
        Mark all products NOT in the provided set of gift_ids as inactive.
        Used for soft-delete during full sync.

        Returns:
            The rowcount of the UPDATE, or 0 when ``seen_ids`` is empty
            (nothing is deactivated in that case).
        """
        if not seen_ids:
            # If seen_ids is empty, we don't deactivate everything
            # as it might be a failed sync. We require at least some IDs.
            return 0

        stmt = (
            update(Product)
            .where(Product.gift_id.notin_(seen_ids))
            .where(Product.is_active.is_(True))
            .values(is_active=False, updated_at=func.now())
        )

        result = await self.session.execute(stmt)
        return result.rowcount

    async def get_active_products_count(self) -> int:
        """Return the number of products with ``is_active`` set to true."""
        query = select(func.count(Product.gift_id)).where(Product.is_active.is_(True))
        result = await self.session.execute(query)
        return result.scalar() or 0

    async def get_products_without_embeddings(self, model_version: str, limit: int = 100) -> list[Product]:
        """
        Fetch active products that need an embedding for the specified model_version.

        A product is returned when no `product_embeddings` row exists for that
        model_version, OR when the existing row's content_hash differs from
        the product's current content_hash.
        """
        from sqlalchemy.orm import aliased
        from sqlalchemy import and_, or_
        from app.models import ProductEmbedding

        p = aliased(Product)
        pe = aliased(ProductEmbedding)

        stmt = (
            select(p)
            .outerjoin(
                pe,
                and_(
                    p.gift_id == pe.gift_id,
                    pe.model_version == model_version
                )
            )
            .where(
                and_(
                    p.is_active.is_(True),
                    or_(
                        # No joined embedding row at all ...
                        pe.gift_id.is_(None),
                        # ... or the stored embedding is for different content.
                        pe.content_hash != p.content_hash
                    )
                )
            )
            .limit(limit)
        )

        result = await self.session.execute(stmt)
        return list(result.scalars().all())

    async def save_embeddings(self, embeddings: list[dict]) -> int:
        """
        Upsert product embeddings.
        embeddings list should contain dicts matching ProductEmbedding model.

        Rows conflicting on (gift_id, model_name, model_version) get their
        embedding and content_hash replaced and their timestamps refreshed.

        Returns:
            The statement's rowcount, or 0 for an empty input.

        Raises:
            Exception: whatever the session raises; it is logged and re-raised.
        """
        if not embeddings:
            return 0

        stmt = insert(ProductEmbedding).values(embeddings)

        update_dict = {
            "embedding": stmt.excluded.embedding, # This assumes we pass vector/list
            "content_hash": stmt.excluded.content_hash,
            # Both timestamps come from the database clock so they agree with
            # each other instead of mixing in a naive app-side datetime.
            "embedded_at": func.now(),
            "updated_at": func.now(),
        }

        stmt = stmt.on_conflict_do_update(
            index_elements=[
                ProductEmbedding.gift_id,
                ProductEmbedding.model_name,
                ProductEmbedding.model_version
            ],
            set_=update_dict
        )

        try:
            result = await self.session.execute(stmt)
            return result.rowcount
        except Exception as e:
            logger.error(f"Failed to upsert embeddings. Batch size: {len(embeddings)}. Error: {type(e).__name__}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    async def search_similar_products(
        self, 
        embedding: list[float], 
        limit: int = 10, 
        min_similarity: float = 0.0, 
        is_active_only: bool = True,
        max_price: Optional[int] = None,
        max_delivery_days: Optional[int] = None,
        model_name: Optional[str] = None
    ) -> list[Product]:
        """
        Return products ordered by cosine distance to ``embedding``.

        Args:
            embedding: Query vector.
            limit: Maximum number of products to return.
            min_similarity: When > 0, keep only rows whose cosine similarity
                (1 - cosine distance) is at least this value.
            is_active_only: Restrict the search to active products.
            max_price: Upper bound on ``Product.price``; ``None`` disables
                the filter (an explicit 0 is honoured as a bound).
            max_delivery_days: Upper bound on ``Product.delivery_days``;
                ``None`` disables the filter (an explicit 0 is honoured).
            model_name: Embedding model to search; defaults to
                ``logic_config.llm.model_embedding``.

        Raises:
            Exception: the database error is re-raised after logging and
                alerting, except when the ``env`` setting is "dev", where an
                empty list is returned instead.
        """
        # Perform vector search using cosine distance (operator <=>)
        from app.core.logic_config import logic_config
        target_model = model_name or logic_config.llm.model_embedding

        stmt = (
            select(Product)
            .join(ProductEmbedding, and_(
                Product.gift_id == ProductEmbedding.gift_id,
                ProductEmbedding.model_name == target_model
            ))
        )

        if is_active_only:
            stmt = stmt.where(Product.is_active.is_(True))

        # Compare against None explicitly: a truthiness test would silently
        # drop the filter when the caller passes 0.
        if max_price is not None:
            stmt = stmt.where(Product.price <= max_price)

        if max_delivery_days is not None:
            stmt = stmt.where(Product.delivery_days <= max_delivery_days)

        distance_col = ProductEmbedding.embedding.cosine_distance(embedding)

        if min_similarity > 0:
            # cosine_similarity = 1 - cosine_distance
            stmt = stmt.where(1 - distance_col >= min_similarity)

        stmt = stmt.order_by(distance_col).limit(limit)
        try:
            result = await self.session.execute(stmt)
            return list(result.scalars().all())
        except Exception as e:
            logger.error(f"CatalogRepository.search_similar_products failed: {e}")
            from app.services.notifications import get_notification_service
            notifier = get_notification_service()
            # Send alert
            if notifier:
                await notifier.notify(
                    topic="db_error",
                    message="Vector search failed in CatalogRepository",
                    data={"error": str(e)}
                )

            from app.config import get_settings
            if get_settings().env == "dev":
                logger.warning("DB search failed, returning empty list (dev mode)")
                return []
            # Bare raise keeps the original traceback intact.
            raise

    async def get_products_without_llm_score(self, limit: int = 100) -> list[Product]:
        """
        Fetch active products that don't have an LLM gift score yet,
        most recently updated first.
        """
        stmt = (
            select(Product)
            .where(
                and_(
                    Product.is_active.is_(True),
                    Product.llm_gift_score.is_(None)
                )
            )
            .order_by(Product.updated_at.desc()) # Or some other priority
            .limit(limit)
        )

        result = await self.session.execute(stmt)
        return list(result.scalars().all())

    async def save_llm_scores(self, scores: list[dict]) -> int:
        """
        Update product rows with LLM scores and reasoning.
        scores list should contain dicts: {'gift_id': str, 'llm_gift_score': float, 'llm_gift_reasoning': str, ...}

        Runs one batched UPDATE (not an upsert: no rows are inserted). The
        caller's dicts are left untouched; the extra bind values are added to
        copies.

        Returns:
            The result's rowcount, or ``len(scores)`` when the result object
            exposes no rowcount. Returns 0 for an empty input.
        """
        if not scores:
            return 0

        # Add the timestamp and the WHERE bind value to copies of the items so
        # the caller's dicts are not mutated as a side effect.
        now = datetime.now()
        params = [
            {**s, "llm_scored_at": now, "b_gift_id": s["gift_id"]}
            for s in scores
        ]

        # Use session.execute with bindparam for batch update
        # This is more robust as it doesn't require all columns for INSERT
        from sqlalchemy import bindparam

        stmt = (
            sa.update(Product)
            .where(Product.gift_id == bindparam("b_gift_id"))
            .values(
                llm_gift_score=bindparam("llm_gift_score"),
                llm_gift_reasoning=bindparam("llm_gift_reasoning"),
                llm_scored_at=bindparam("llm_scored_at"),
                updated_at=func.now()
            )
            .execution_options(synchronize_session=None)
        )

        result = await self.session.execute(stmt, params)
        # For bulk updates, rowcount might not be directly available on IteratorResult
        try:
            return result.rowcount
        except AttributeError:
            return len(params)

Functions

`get_products_without_embeddings(model_version, limit=100)` `async`

Fetch active products that do not have an up-to-date embedding for the specified model_version: either no product_embeddings entry exists for it, OR the entry's content_hash doesn't match the product's.

Source code in app/repositories/catalog.py

async def get_products_without_embeddings(self, model_version: str, limit: int = 100) -> list[Product]:
    """
    Return up to ``limit`` active products that need an embedding for ``model_version``.

    A product qualifies when the outer join finds no embedding row for that
    model version, or when the joined row's content_hash differs from the
    product's content_hash.
    """
    from sqlalchemy.orm import aliased
    from sqlalchemy import and_, or_
    from app.models import ProductEmbedding

    product = aliased(Product)
    emb = aliased(ProductEmbedding)

    # Join only embeddings that belong to the requested model version.
    join_on = and_(
        product.gift_id == emb.gift_id,
        emb.model_version == model_version,
    )
    # Missing embedding row, or one computed for different content.
    needs_embedding = or_(
        emb.gift_id.is_(None),
        emb.content_hash != product.content_hash,
    )
    criteria = and_(product.is_active.is_(True), needs_embedding)

    query = select(product).outerjoin(emb, join_on).where(criteria).limit(limit)

    rows = await self.session.execute(query)
    return list(rows.scalars().all())

`get_products_without_llm_score(limit=100)` `async`

Fetch products that don't have an LLM gift score yet.

Source code in app/repositories/catalog.py

async def get_products_without_llm_score(self, limit: int = 100) -> list[Product]:
    """
    Return up to ``limit`` active products whose llm_gift_score is NULL,
    most recently updated first.
    """
    unscored_active = and_(
        Product.is_active.is_(True),
        Product.llm_gift_score.is_(None),
    )
    query = select(Product).where(unscored_active)
    # Newest updates first; the ordering is the only prioritisation applied.
    query = query.order_by(Product.updated_at.desc()).limit(limit)

    rows = await self.session.execute(query)
    return list(rows.scalars().all())

`mark_inactive_except(seen_ids)` `async`

Mark all products NOT in the provided set of gift_ids as inactive. Used for soft-delete during full sync.

Source code in app/repositories/catalog.py

async def mark_inactive_except(self, seen_ids: set[str]) -> int:
    """
    Soft-delete every active product whose gift_id is absent from ``seen_ids``.

    Intended for the end of a full sync. Returns the UPDATE's rowcount, or 0
    when ``seen_ids`` is empty.
    """
    # An empty set may just mean the sync failed, so refuse to deactivate
    # the whole catalog; at least some IDs are required.
    if seen_ids:
        unseen_and_active = (
            update(Product)
            .where(Product.gift_id.notin_(seen_ids))
            .where(Product.is_active.is_(True))
        )
        deactivate = unseen_and_active.values(is_active=False, updated_at=func.now())
        outcome = await self.session.execute(deactivate)
        return outcome.rowcount

    return 0

`save_embeddings(embeddings)` `async`

Upsert product embeddings. embeddings list should contain dicts matching ProductEmbedding model.

Source code in app/repositories/catalog.py

async def save_embeddings(self, embeddings: list[dict]) -> int:
    """
    Upsert product embeddings.
    embeddings list should contain dicts matching ProductEmbedding model.

    Rows conflicting on (gift_id, model_name, model_version) get their
    embedding and content_hash replaced and their timestamps refreshed.

    Returns:
        The statement's rowcount, or 0 for an empty input.

    Raises:
        Exception: whatever the session raises; it is logged and re-raised.
    """
    if not embeddings:
        return 0

    stmt = insert(ProductEmbedding).values(embeddings)

    update_dict = {
        "embedding": stmt.excluded.embedding, # This assumes we pass vector/list
        "content_hash": stmt.excluded.content_hash,
        # Both timestamps come from the database clock so they agree with
        # each other instead of mixing in a naive app-side datetime.
        "embedded_at": func.now(),
        "updated_at": func.now(),
    }

    stmt = stmt.on_conflict_do_update(
        index_elements=[
            ProductEmbedding.gift_id,
            ProductEmbedding.model_name,
            ProductEmbedding.model_version
        ],
        set_=update_dict
    )

    try:
        result = await self.session.execute(stmt)
        return result.rowcount
    except Exception as e:
        logger.error(f"Failed to upsert embeddings. Batch size: {len(embeddings)}. Error: {type(e).__name__}: {e}")
        # Bare raise keeps the original traceback intact.
        raise

`save_llm_scores(scores)` `async`

Update product rows with LLM scores and reasoning. scores list should contain dicts: {'gift_id': str, 'llm_gift_score': float, 'llm_gift_reasoning': str, ...} Uses a single batched UPDATE for efficiency (existing rows only; nothing is inserted).

Source code in app/repositories/catalog.py

async def save_llm_scores(self, scores: list[dict]) -> int:
    """
    Update product rows with LLM scores and reasoning.
    scores list should contain dicts: {'gift_id': str, 'llm_gift_score': float, 'llm_gift_reasoning': str, ...}

    Runs one batched UPDATE (not an upsert: no rows are inserted). The
    caller's dicts are left untouched; the extra bind values are added to
    copies.

    Returns:
        The result's rowcount, or ``len(scores)`` when the result object
        exposes no rowcount. Returns 0 for an empty input.
    """
    if not scores:
        return 0

    # Add the timestamp and the WHERE bind value to copies of the items so
    # the caller's dicts are not mutated as a side effect.
    now = datetime.now()
    params = [
        {**s, "llm_scored_at": now, "b_gift_id": s["gift_id"]}
        for s in scores
    ]

    # Use session.execute with bindparam for batch update
    # This is more robust as it doesn't require all columns for INSERT
    from sqlalchemy import bindparam

    stmt = (
        sa.update(Product)
        .where(Product.gift_id == bindparam("b_gift_id"))
        .values(
            llm_gift_score=bindparam("llm_gift_score"),
            llm_gift_reasoning=bindparam("llm_gift_reasoning"),
            llm_scored_at=bindparam("llm_scored_at"),
            updated_at=func.now()
        )
        .execution_options(synchronize_session=None)
    )

    result = await self.session.execute(stmt, params)
    # For bulk updates, rowcount might not be directly available on IteratorResult
    try:
        return result.rowcount
    except AttributeError:
        return len(params)

`repositories.recommendations.create_quiz_run(db, *, user_id, anon_id, answers_json)` `async`

Source code in repositories/recommendations.py

async def create_quiz_run(
    db: AsyncSession,
    *,
    user_id: Optional[UUID],
    anon_id: Optional[str],
    answers_json: dict[str, Any],
) -> QuizRun:
    """Store a new QuizRun, commit, and return the refreshed instance."""
    run_fields = {
        "user_id": user_id,
        "anon_id": anon_id,
        "answers_json": answers_json,
    }
    created = QuizRun(**run_fields)
    db.add(created)
    await db.commit()
    # Refresh after the commit so the returned object is reloaded from the database.
    await db.refresh(created)
    return created

`repositories.recommendations.log_event(db, event_name, *, user_id=None, anon_id=None, quiz_run_id=None, recommendation_run_id=None, gift_id=None, payload=None)` `async`

Source code in repositories/recommendations.py

async def log_event(
    db: AsyncSession,
    event_name: str,
    *,
    user_id: Optional[UUID] = None,
    anon_id: Optional[str] = None,
    quiz_run_id: Optional[UUID] = None,
    recommendation_run_id: Optional[UUID] = None,
    gift_id: Optional[str] = None,
    payload: Optional[dict[str, Any]] = None,
) -> None:
    """Record a single Event row with the given attributes and commit it."""
    event_fields = {
        "event_name": event_name,
        "user_id": user_id,
        "anon_id": anon_id,
        "quiz_run_id": quiz_run_id,
        "recommendation_run_id": recommendation_run_id,
        "gift_id": gift_id,
        "payload": payload,
    }
    db.add(Event(**event_fields))
    await db.commit()

Репозитории (Repositories)

app.repositories.parsing.ParsingRepository

Functions

activate_sources(source_ids) async

count_discovered_today() async

get_24h_stats() async

get_active_workers() async

get_aggregate_history(site_key, limit_days=15) async

get_aggregate_status(site_key) async

get_discovered_sources(limit=50) async

get_last_full_cycle_stats(site_key) async

get_sites_monitoring() async

get_source(source_id) async

get_unmapped_categories(limit=100) async

set_queued(source_id) async

sync_spiders(available_spiders) async

update_category_mappings(mappings) async

app.repositories.catalog.PostgresCatalogRepository

Functions

get_products_without_embeddings(model_version, limit=100) async

get_products_without_llm_score(limit=100) async

mark_inactive_except(seen_ids) async

save_embeddings(embeddings) async

save_llm_scores(scores) async

repositories.recommendations.create_quiz_run(db, *, user_id, anon_id, answers_json) async

repositories.recommendations.log_event(db, event_name, *, user_id=None, anon_id=None, quiz_run_id=None, recommendation_run_id=None, gift_id=None, payload=None) async

`app.repositories.parsing.ParsingRepository`

`activate_sources(source_ids)` `async`

`count_discovered_today()` `async`

`get_24h_stats()` `async`

`get_active_workers()` `async`

`get_aggregate_history(site_key, limit_days=15)` `async`

`get_aggregate_status(site_key)` `async`

`get_discovered_sources(limit=50)` `async`

`get_last_full_cycle_stats(site_key)` `async`

`get_sites_monitoring()` `async`

`get_source(source_id)` `async`

`get_unmapped_categories(limit=100)` `async`

`set_queued(source_id)` `async`

`sync_spiders(available_spiders)` `async`

`update_category_mappings(mappings)` `async`

`app.repositories.catalog.PostgresCatalogRepository`

`get_products_without_embeddings(model_version, limit=100)` `async`

`get_products_without_llm_score(limit=100)` `async`

`mark_inactive_except(seen_ids)` `async`

`save_embeddings(embeddings)` `async`

`save_llm_scores(scores)` `async`

`repositories.recommendations.create_quiz_run(db, *, user_id, anon_id, answers_json)` `async`

`repositories.recommendations.log_event(db, event_name, *, user_id=None, anon_id=None, quiz_run_id=None, recommendation_run_id=None, gift_id=None, payload=None)` `async`