# Source code for scrachy.middleware.filter

#  Copyright 2020 Reid Swanson.
#
#  This file is part of scrachy.
#
#  scrachy is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  scrachy is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with scrachy.  If not, see <https://www.gnu.org/licenses/>.

"""
Middleware for filtering (or ignoring) responses if they are fresh in the
cache.
"""

# Python Modules
import logging

# 3rd Party Modules
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest
import scrapy.settings

# Project Modules
from scrachy.db.engine import session_scope
from scrachy.db.repositories import ResponseRepository
from scrachy.utils.datetime import now_tzaware
from scrachy.utils.request import ExpirationManager
from scrachy.utils.settings import compile_patterns

log = logging.getLogger(__name__)



class CachedResponseFilter:
    """
    Middleware that ignores requests whose responses are already fresh in
    the cache.

    See :meth:`__init__` for the full set of rules that decide whether a
    request is filtered.
    """

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> "CachedResponseFilter":
        """
        Create the middleware from a crawler.

        :param crawler: The current crawler.
        :return: A new instance of this middleware.
        """
        return cls(crawler)

    def __init__(self, crawler: Crawler):
        """
        Sometimes you scrape the same domains multiple times looking for
        new content. However, when crawling them you might encounter pages that
        you have already scraped. If your extraction rules have not changed
        since the last crawl it may not be worth reprocessing those pages.

        This middleware looks to see if a response corresponding to a
        request is already in the cache and is not stale. A request is
        never filtered when any of the following holds:

        * The request meta key ``dont_filter`` is set to ``True``. Such
          requests are not processed by this middleware.
        * The request is already excluded from caching via the
          ``dont_cache`` request meta key.
        * The request url matches one of the exclusion patterns. Any
          pattern that matches part of the url prevents the request from
          being filtered regardless of whether it is in the cache or
          not. This might be useful after changing parsing rules
          for a set of pages. The patterns are specified using the
          ``SCRACHY_CACHED_RESPONSE_FILTER_EXCLUSIONS`` setting, which takes a
          list of ``re.Patterns`` or strings which can be compiled to regular
          expressions.
        * There is no response for the request in the cache.
        * The response in the cache is stale.

        Any other request, i.e., one that has a fresh response in the cache,
        will be filtered.

        :param crawler: The current crawler.
        """
        # Prefer the spider's settings when a spider is already attached.
        settings = crawler.spider.settings if crawler.spider else crawler.settings

        self._fingerprinter = crawler.request_fingerprinter
        # ``or []`` guards against ``compile_patterns`` returning ``None``
        # when the setting is not configured (its behavior is not visible
        # from this module), because the patterns are iterated on every
        # request.
        self._exclude_patterns = compile_patterns(
            settings.get('SCRACHY_CACHED_RESPONSE_FILTER_EXCLUSIONS')
        ) or []
        self._expiration_manager = ExpirationManager(settings)

    # noinspection PyUnusedLocal
    def process_request(self, request: scrapy.http.Request, spider: scrapy.spiders.Spider) -> None:
        """
        Ignore the request if a fresh response for it is already cached.

        The cheap, in-memory checks (meta flags and url exclusion patterns)
        are performed before the cache lookup so that requests which can
        never be filtered do not cost a fingerprint computation or a
        database query.

        :param request: The Scrapy request.
        :param spider: The Scrapy Spider issuing the request.

        :raises IgnoreRequest: If the item is already cached, and it does
                not meet the requirement to be excluded.
        """
        meta = request.meta

        # If dont_cache or dont_filter is set then don't filter the request.
        if meta.get('dont_cache') or meta.get('dont_filter'):
            return

        url = request.url

        # If the url is in the exclusion list then don't filter the request
        # even if it is already in the cache. The outcome does not depend on
        # the cache, so decide this before querying it.
        if any(pattern.search(url) for pattern in self._exclude_patterns):
            return

        fingerprint = self._fingerprinter.fingerprint(request)

        # Look up when (if ever) a response for this request was cached.
        with session_scope() as session:
            scrape_timestamp = ResponseRepository(session).find_timestamp_by_fingerprint(fingerprint)

        # If the item is not in the cache, then don't filter it.
        if scrape_timestamp is None:
            return

        # If the item in the cache is stale, don't filter it.
        if self._expiration_manager.is_stale(url, scrape_timestamp, now_tzaware()):
            return

        # Otherwise a fresh response is cached: ignore the request.
        raise IgnoreRequest(f"A fresh response for '{url}' is already in the cache.")