# Source code for scrachy.db.models

#  Copyright 2020 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.

"""
The SqlAlchemy models.
"""

from __future__ import annotations

# Python Modules
from typing import Annotated, Optional

# 3rd Party Modules
import sqlalchemy
import sqlalchemy.orm

from sqlalchemy import ColumnElement, String, ForeignKey, type_coerce
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import Mapped, mapped_column, relationship

# Project Modules
from scrachy.db.base import Base, binary, smallint, timestamp
from scrachy.settings import PROJECT_SETTINGS

# TODO I'm not sure the schema and qualified table name stuff is required,
#      because of the `schema_translate_map` added to the engine
#      execution_options.
# The optional database schema name read from the project settings.
schema = PROJECT_SETTINGS.get('SCRACHY_DB_SCHEMA')

# The text placed in front of a table name to qualify it: the schema
# followed by a dot, or nothing at all when no schema is configured.
if schema:
    schema_prefix = f"{schema}."
else:
    schema_prefix = ""

# Reusable annotated types for declaring primary key columns.
str_pk = Annotated[str, mapped_column(primary_key=True)]
int_pk = Annotated[int, mapped_column(primary_key=True)]


def qtn(name: str) -> str:
    """
    Return the qualified table name (i.e., with schema if necessary).

    The module-level ``schema_prefix`` is prepended to ``name``. That prefix
    is the configured schema followed by a dot, or an empty string when no
    schema is configured, in which case ``name`` is returned unchanged.

    :param name: The unqualified name to qualify. This module also passes
                 ``table.column`` style references (for foreign keys).
    :return: The name prefixed with the schema, if one is configured.
    """
    return f"{schema_prefix}{name}"


# Primary tables
class Response(Base):
    """
    Stores the minimum amount of information to recreate the html of a page
    given its request fingerprint. The cache only stores the most recent
    version of the page, but the :class:`ScrapeHistory` table can be used to
    view the body that was stored at each scrape timestamp.

    Additional data is stored in other tables, but can be accessed via
    relationships.
    """

    # A hash of the page's uniquely identifying request information.
    fingerprint: Mapped[binary] = mapped_column(primary_key=True)

    # The most recent timestamp when the page was scraped.
    scrape_timestamp: Mapped[timestamp]

    # This is redundant with the request, but can prevent a join when
    # we might want to search or filter responses by it.
    url: Mapped[str] = mapped_column(index=True)

    # The request method (True for GET. False for POST)
    method: Mapped[bool]

    # The body of the request (primarily used for POST requests)
    request_body: Mapped[Optional[str]]

    # The response body
    body: Mapped[str]

    # The response status code for the page
    status: Mapped[Optional[smallint]]

    # The response headers
    headers: Mapped[Optional[str]]

    # The text of the response stripped of HTML and possibly stripped of
    # boilerplate content.
    extracted_text: Mapped[Optional[str]]

    # The number of bytes of the full downloaded response (including the HTML)
    body_length: Mapped[int]

    # The number of bytes of the response excluding the HTML.
    extracted_text_length: Mapped[Optional[int]]

    # The scrape history
    scrape_history: Mapped[list[ScrapeHistory]] = relationship(cascade='all, delete-orphan')

    # I believe hybrid properties allow changing how the property behaves
    # depending on whether you are using it as an instance or as a class
    # template. For example, when operating on an instance, referencing the
    # `request_method` property will use the regular python code in the
    # method body. But if you are using it as a template for SQL operations
    # you can define a @xxx.inplace.expression method that will use the same
    # property name but emit SQL code instead.
    #
    # So here, for example, it uses less space to store the request method
    # as a boolean (because it can only ever have two values), but it is
    # more convenient to work with these values as their original string
    # representations.
    @hybrid_property
    def request_method(self) -> Optional[str]:
        """
        The request method as a string.

        :return: ``'GET'`` when ``method`` is ``True``, ``'POST'`` when it
                 is ``False`` and ``None`` for any other value (e.g., when
                 it has not been set yet).
        """
        if self.method is True:
            return 'GET'
        elif self.method is False:
            return 'POST'

        return None

    @request_method.inplace.setter
    def _request__method(self, method: str) -> None:
        """
        Store the request method as its boolean representation.

        :param method: The request method name. It is compared
                       case-insensitively against ``'GET'``.
        """
        # NOTE(review): every value other than 'GET' (not only 'POST') is
        # stored as False, so it will be read back as 'POST'.
        self.method = method.upper() == 'GET'  # noqa

    # noinspection PyMethodParameters,PyNestedDecorators
    @request_method.inplace.expression
    @classmethod
    def _request_method(cls) -> ColumnElement[Optional[str]]:
        """
        The SQL expression used when ``request_method`` is accessed on the
        class instead of an instance.

        :return: A ``CASE`` expression, coerced to a string type, that maps
                 the boolean ``method`` column to ``'GET'`` or ``'POST'``
                 and to ``NULL`` otherwise.
        """
        # The `== True` / `== False` comparisons are intentional. They are
        # overloaded by the column to build SQL expressions.
        return type_coerce(
            sqlalchemy.case(
                (cls.method == True, 'GET'),  # noqa: E712
                (cls.method == False, 'POST'),  # noqa: E712
                else_=None
            ),
            String
        )


class ScrapeHistory(Base):
    """
    Store the history of when a page was scraped.

    Each row is identified by the fingerprint of the response it belongs to
    together with the timestamp of the scrape.
    """

    # The fingerprint of the response this entry belongs to. The foreign key
    # target is qualified with the schema (if any) via ``qtn``.
    fingerprint: Mapped[binary] = mapped_column(ForeignKey(qtn('response.fingerprint')), primary_key=True)

    # When the page was scraped
    scrape_timestamp: Mapped[timestamp] = mapped_column(primary_key=True)

    # The response body
    body: Mapped[str]