Source code for scrachy.http_

#  Copyright 2020 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.
"""
Additional ``Request`` and ``Response`` classes for working with Selenium
and the ``AlchemyCacheStorage`` backend.

Note: Naming this module ``http`` causes a circular import error, so I've appended
an underscore to avoid conflicts.
"""
from __future__ import annotations

# Python Modules
import datetime
import logging

from typing import Any, Optional, Protocol

# 3rd Party Modules
from scrapy.http import HtmlResponse, TextResponse, Request, Response, XmlResponse
from selenium.webdriver.remote.webdriver import WebDriver

# Project Modules
from scrachy.db.models import ScrapeHistory

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class WaitCondition(Protocol):
    """
    Structural type for a callable that receives the active Selenium
    ``WebDriver`` and returns any value.

    :class:`SeleniumRequest` documents its ``wait_until`` argument, which has
    this type, as one of the ``selenium.webdriver.support.expected_conditions``.
    """

    def __call__(self, driver: WebDriver) -> Any:
        """
        Evaluate the condition against ``driver``.

        :param driver: The webdriver the condition is checked against.
        :return: Whatever the condition produces. This stub returns ``None``.
        """
        ...
class ScriptExecutor(Protocol):
    """
    Structural type for a callable that is given the Selenium ``WebDriver``
    together with the ``Request`` being processed.

    It may return nothing, a single ``Response``, a list of responses or a
    dictionary mapping string keys to responses.
    """

    def __call__(
        self,
        driver: WebDriver,
        request: Request
    ) -> Optional[Response | list[Response] | dict[str, Response]]:
        """
        Run the script for ``request`` using ``driver``.

        :param driver: The webdriver to run the script with.
        :param request: The request the script is executed for.
        :return: ``None``, one response, a list of responses or a dictionary
                 of named responses. This stub returns ``None``.
        """
        ...
class SeleniumRequest(Request):
    """
    A subclass of :class:`scrapy.http.Request` that carries extra options
    for downloading pages using Selenium.

    Based off the code from
    `Scrapy-Selenium <https://github.com/clemfromspace/scrapy-selenium>`_
    """

    def __init__(
        self,
        wait_timeout: Optional[float] = None,
        wait_until: Optional[WaitCondition] = None,
        screenshot: bool = False,
        script_executor: Optional[ScriptExecutor] = None,
        *args,
        **kwargs
    ):
        """
        A new ``SeleniumRequest``.

        The four Selenium options are declared ahead of ``*args``, so
        positional arguments are bound to them first. Pass the regular
        ``Request`` arguments (``url``, ``callback``, ...) by keyword.

        The options are only stored on the request here; acting on them is
        left to whatever component downloads the request.

        :param wait_timeout: The number of seconds to wait before accessing
               the data.
        :param wait_until: One of the
               "selenium.webdriver.support.expected_conditions". The response
               is intended to be held back until the given condition is
               fulfilled.
        :param screenshot: If ``True``, a screenshot of the page will be taken
               and the data of the screenshot will be returned in the response
               "meta" attribute.
        :param script_executor: A callable that takes a webdriver and the
               request as its parameters and optionally returns new response
               objects as a side effect of its actions (e.g., executing
               arbitrary javascript code on the page). Any returned responses
               are documented to be made available in the ``request.meta``
               attribute under the key ``script_result``. Note that the
               returned responses will not be further processed by any other
               middleware.
        :param args: Positional arguments forwarded to
               :class:`scrapy.http.Request`.
        :param kwargs: Keyword arguments forwarded to
               :class:`scrapy.http.Request`.
        """
        # Let the base request consume everything that is not Selenium specific.
        super().__init__(*args, **kwargs)

        self.script_executor = script_executor
        self.screenshot = screenshot
        self.wait_until = wait_until
        self.wait_timeout = wait_timeout
class CachedResponseMixin:
    """
    Mixin that attaches a subset of the extra information stored in the cache
    to a response object.

    It is meant to be listed before a concrete response class in the bases of
    a subclass; its initializer keeps the cache fields and hands everything
    else on to the next class in the method resolution order.
    """

    def __init__(
        self,
        scrape_timestamp: Optional[datetime.datetime] = None,
        extracted_text: Optional[str] = None,
        body_length: Optional[int] = None,
        extracted_text_length: Optional[int] = None,
        scrape_history: Optional[list[ScrapeHistory]] = None,
        *args,
        **kwargs
    ):
        """
        Store the cache metadata and forward the remaining arguments.

        The cache fields are declared ahead of ``*args``, so positional
        arguments are bound to them first. Pass the arguments of the wrapped
        response class by keyword.

        :param scrape_timestamp: The most recent date the request was scraped.
        :param extracted_text: The text extracted from the HTML.
        :param body_length: The length of the downloaded body (presumably in
               bytes; confirm against the cache storage).
        :param extracted_text_length: The length of the extracted plain text
               (presumably in bytes; confirm against the cache storage).
        :param scrape_history: The ``ScrapeHistory`` records associated with
               the response, if any.
        :param args: Positional arguments forwarded to the next initializer.
        :param kwargs: Keyword arguments forwarded to the next initializer.
        """
        # Cooperative call so the sibling response class is initialized too.
        super().__init__(*args, **kwargs)

        self.scrape_history = scrape_history
        self.extracted_text_length = extracted_text_length
        self.body_length = body_length
        self.extracted_text = extracted_text
        self.scrape_timestamp = scrape_timestamp
class CachedHtmlResponse(CachedResponseMixin, HtmlResponse):
    """
    An ``HtmlResponse`` that also accepts and stores the cache fields handled
    by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward every argument unchanged to the next initializer in the
        method resolution order (``CachedResponseMixin`` first).
        """
        super().__init__(*args, **kwargs)
class CachedTextResponse(CachedResponseMixin, TextResponse):
    """
    A ``TextResponse`` that also accepts and stores the cache fields handled
    by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward every argument unchanged to the next initializer in the
        method resolution order (``CachedResponseMixin`` first).
        """
        super().__init__(*args, **kwargs)
class CachedXmlResponse(CachedResponseMixin, XmlResponse):
    """
    An ``XmlResponse`` that also accepts and stores the cache fields handled
    by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward every argument unchanged to the next initializer in the
        method resolution order (``CachedResponseMixin`` first).
        """
        super().__init__(*args, **kwargs)