# Copyright 2020 Reid Swanson.
#
# This file is part of scrachy.
#
# scrachy is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scrachy is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with scrachy. If not, see <https://www.gnu.org/licenses/>.
"""
Additional ``Request`` and ``Response`` classes for working with Selenium
and the ``AlchemyCacheStorage`` backend.
Note: Naming this module ``http`` causes a circular import error, so I've appended
an underscore to avoid conflicts.
"""
from __future__ import annotations
# Python Modules
import datetime
import logging
from typing import Any, Optional, Protocol
# 3rd Party Modules
from scrapy.http import HtmlResponse, TextResponse, Request, Response, XmlResponse
from selenium.webdriver.remote.webdriver import WebDriver
# Project Modules
from scrachy.db.models import ScrapeHistory
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
class WaitCondition(Protocol):
    """
    Structural type for a callable that is handed a Selenium ``WebDriver``.

    :class:`SeleniumRequest` accepts an object of this shape as its
    ``wait_until`` argument.
    """

    def __call__(self, driver: WebDriver) -> Any:
        """
        Invoke the condition with ``driver``.

        :param driver: The web driver the condition is called with.

        :return: Any value; the protocol places no constraint on it.
        """
        ...
class ScriptExecutor(Protocol):
    """
    Structural type for a callable that receives a Selenium ``WebDriver``
    together with a ``Request``.

    :class:`SeleniumRequest` accepts an object of this shape as its
    ``script_executor`` argument.
    """

    def __call__(
        self,
        driver: WebDriver,
        request: Request
    ) -> Optional[Response | list[Response] | dict[str, Response]]:
        """
        Invoke the executor.

        :param driver: The web driver the executor is called with.
        :param request: The request the executor is called with.

        :return: ``None``, a single response, a list of responses or a
                 dictionary mapping string keys to responses.
        """
        ...
class SeleniumRequest(Request):
    """
    A subclass of :class:`scrapy.http.Request` that provides extra information for downloading pages using
    Selenium.

    Based off the code from `Scrapy-Selenium <https://github.com/clemfromspace/scrapy-selenium>`_
    """

    # ``Request.replace()`` (and therefore ``copy()``), ``to_dict()`` and
    # ``request_from_dict()`` rebuild a request from the names listed in
    # ``attributes``. Without these entries the Selenium specific settings are
    # silently reset to their defaults whenever a request is copied. The
    # ``getattr`` guard keeps the class importable on Scrapy versions that
    # predate ``Request.attributes``.
    attributes: tuple[str, ...] = (
        *getattr(Request, "attributes", ()),
        "wait_timeout",
        "wait_until",
        "screenshot",
        "script_executor",
    )

    def __init__(
        self,
        wait_timeout: Optional[float] = None,
        wait_until: Optional[WaitCondition] = None,
        screenshot: bool = False,
        script_executor: Optional[ScriptExecutor] = None,
        *args,
        **kwargs
    ):
        """
        A new ``SeleniumRequest``.

        Note that the Selenium specific parameters come *before* ``*args``, so
        the first positional arguments bind to them. Pass ``url`` and the other
        :class:`scrapy.http.Request` arguments by keyword.

        :param wait_timeout: The number of seconds to wait before accessing the data.
        :param wait_until: One of the "selenium.webdriver.support.expected_conditions". The response
                           will not be returned until the given condition is fulfilled.
        :param screenshot: If ``True``, a screenshot of the page will be taken and the data of the screenshot
                           will be returned in the response "meta" attribute.
        :param script_executor: A function that takes a webdriver and a request as its parameters and optionally
                                returns a response, a list of responses or a dictionary of responses as a side
                                effect of its actions (e.g., executing arbitrary javascript code on the page).
                                Any returned responses will be returned in the ``request.meta`` attribute with
                                the key ``script_result``. Note that the returned responses will not be further
                                processed by any other middleware.
        :param args: Positional arguments forwarded to :class:`scrapy.http.Request`.
        :param kwargs: Keyword arguments forwarded to :class:`scrapy.http.Request`.
        """
        super().__init__(*args, **kwargs)

        self.wait_timeout = wait_timeout
        self.wait_until = wait_until
        self.screenshot = screenshot
        self.script_executor = script_executor
class CachedResponseMixin:
    """
    A mixin that stores a subset of the extra information kept in the cache
    on a response object.

    It is meant to be listed *before* a response class in the bases of a
    concrete class so that its ``__init__`` can consume the cache specific
    arguments and forward everything else along the MRO.
    """

    def __init__(
        self,
        scrape_timestamp: Optional[datetime.datetime] = None,
        extracted_text: Optional[str] = None,
        body_length: Optional[int] = None,
        extracted_text_length: Optional[int] = None,
        scrape_history: Optional[list[ScrapeHistory]] = None,
        *args,
        **kwargs
    ):
        """
        Store the cache metadata and forward the remaining arguments.

        Note that the cache specific parameters come *before* ``*args``, so
        the first positional arguments bind to them. Pass the arguments of the
        underlying response class by keyword.

        :param scrape_timestamp: The most recent date the request was scraped.
        :param extracted_text: The text extracted from the HTML.
        :param body_length: The total number of bytes of the downloaded
               html.
        :param extracted_text_length: The number of bytes in the extracted
               plain text.
        :param scrape_history: A list of ``ScrapeHistory`` records
               (presumably the earlier scrapes of the same request; confirm
               against the cache storage that populates it).
        :param args: Positional arguments forwarded to the next ``__init__``
               in the MRO.
        :param kwargs: Keyword arguments forwarded to the next ``__init__``
               in the MRO.
        """
        super().__init__(*args, **kwargs)

        self.scrape_timestamp = scrape_timestamp
        self.extracted_text = extracted_text
        self.body_length = body_length
        self.extracted_text_length = extracted_text_length
        self.scrape_history = scrape_history
class CachedHtmlResponse(CachedResponseMixin, HtmlResponse):
    """
    An ``HtmlResponse`` that also accepts and stores the cache metadata
    handled by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward all arguments unchanged to the next ``__init__`` in the MRO
        (``CachedResponseMixin`` first, then ``HtmlResponse``).
        """
        super().__init__(*args, **kwargs)
class CachedTextResponse(CachedResponseMixin, TextResponse):
    """
    A ``TextResponse`` that also accepts and stores the cache metadata
    handled by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward all arguments unchanged to the next ``__init__`` in the MRO
        (``CachedResponseMixin`` first, then ``TextResponse``).
        """
        super().__init__(*args, **kwargs)
class CachedXmlResponse(CachedResponseMixin, XmlResponse):
    """
    An ``XmlResponse`` that also accepts and stores the cache metadata
    handled by ``CachedResponseMixin``.
    """

    def __init__(self, *args, **kwargs):
        """
        Forward all arguments unchanged to the next ``__init__`` in the MRO
        (``CachedResponseMixin`` first, then ``XmlResponse``).
        """
        super().__init__(*args, **kwargs)