|
3 | 3 | import asyncio |
4 | 4 | import gzip |
5 | 5 | import sqlite3 |
| 6 | +from asyncio import Queue |
6 | 7 | from pathlib import Path |
7 | | -from typing import Callable, Sequence |
| 8 | +from typing import Callable |
8 | 9 |
|
9 | | -from playwright.async_api import async_playwright |
| 10 | +from playwright.async_api import Page, async_playwright |
10 | 11 | from playwright.sync_api import sync_playwright |
11 | 12 |
|
12 | 13 | from cppref.typing_ import Record, Source |
@@ -37,27 +38,59 @@ def fetch(record: Record, timeout: float) -> str: |
37 | 38 | return content |
38 | 39 |
|
39 | 40 | @staticmethod |
40 | | - async def afetch(*records: Record, timeout: float, limit: int): |
41 | | - def batch_iter[T](data: Sequence[T]): |
42 | | - length = len(data) |
43 | | - for i in range(0, length, limit): |
44 | | - yield data[i : i + limit] |
| 41 | + async def afetch( |
| 42 | + *records: Record, |
| 43 | + timeout: float, |
| 44 | + limit: int, |
| 45 | + on_success: Callable[[Record, str], None], |
| 46 | + on_failed: Callable[[Record, Exception], None], |
| 47 | + ): |
| 48 | + _records = Queue[Record]() |
| 49 | + for recrod in records: |
| 50 | + _records.put_nowait(recrod) |
| 51 | + |
| 52 | + _results = Queue[tuple[Record, Exception | str]]() |
| 53 | + |
| 54 | + async def producer(page: Page): |
| 55 | + while not _records.empty(): |
| 56 | + record = _records.get_nowait() |
| 57 | + try: |
| 58 | + resp = await page.goto(record.url, timeout=timeout, wait_until="networkidle") # fmt: off |
| 59 | + assert resp is not None, f"Timeout: {record}" |
| 60 | + assert resp.ok, f"Request failed: {record}, status={resp.status_text}" # fmt: off |
| 61 | + except Exception as e: |
| 62 | + _results.put_nowait((record, e)) |
| 63 | + else: |
| 64 | + _results.put_nowait((record, await page.content())) |
| 65 | + finally: |
| 66 | + _records.task_done() |
| 67 | + |
| 68 | + async def customer(): |
| 69 | + while True: |
| 70 | + record, resp = await _results.get() |
| 71 | + if isinstance(resp, str): |
| 72 | + try: |
| 73 | + on_success(record, resp) |
| 74 | + except Exception as e: |
| 75 | + on_failed(record, e) |
| 76 | + else: |
| 77 | + on_failed(record, resp) |
| 78 | + _results.task_done() |
45 | 79 |
|
46 | 80 | async with async_playwright() as p: |
47 | 81 | browser = await p.chromium.launch(headless=True) |
48 | 82 | pages = [await browser.new_page() for _ in range(limit)] |
49 | | - |
50 | | - async def _fetch(index: int, record: Record) -> str: |
51 | | - resp = await pages[index].goto(record.url, timeout=timeout, wait_until="networkidle") # fmt: off |
52 | | - assert resp is not None, f"Timeout: {record}" |
53 | | - assert resp.ok, f"Request failed: status={resp.status_text}, {record}" |
54 | | - return await pages[index].content() |
55 | | - |
56 | | - for batch in batch_iter(records): |
57 | | - tasks = map(lambda t: _fetch(t[0], t[1]), enumerate(batch)) |
58 | | - htmls = await asyncio.gather(*tasks, return_exceptions=True) |
59 | | - for html in htmls: |
60 | | - yield html |
| 83 | + producers = [asyncio.create_task(producer(pages[i])) for i in range(limit)] |
| 84 | + customers = [asyncio.create_task(customer()) for _ in range(limit)] |
| 85 | + await _records.join() |
| 86 | + for p in producers: |
| 87 | + p.cancel() |
| 88 | + await _results.join() |
| 89 | + for c in customers: |
| 90 | + c.cancel() |
| 91 | + |
| 92 | + await asyncio.gather(*producers, return_exceptions=True) |
| 93 | + await asyncio.gather(*customers, return_exceptions=True) |
61 | 94 |
|
62 | 95 | for page in pages: |
63 | 96 | await page.close() |
|
0 commit comments