Source code for scrapy.spiders

"""
Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
"""

from __future__ import annotations

import logging
import warnings
from typing import TYPE_CHECKING, Any, cast

from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, Response
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider

if TYPE_CHECKING:
    from collections.abc import AsyncIterator, Iterable

    from twisted.internet.defer import Deferred

    # typing.Self requires Python 3.11
    from typing_extensions import Self

    from scrapy.crawler import Crawler
    from scrapy.http.request import CallbackT
    from scrapy.settings import BaseSettings, _SettingsKeyT
    from scrapy.utils.log import SpiderLoggerAdapter


class Spider(object_ref):
    """Base class that any spider must subclass.

    It provides a default :meth:`start` implementation that sends requests
    based on the :attr:`start_urls` class attribute and calls the
    :meth:`parse` method for each response.
    """

    name: str
    custom_settings: dict[_SettingsKeyT, Any] | None = None

    #: Start URLs. See :meth:`start`.
    start_urls: list[str]

    def __init__(self, name: str | None = None, **kwargs: Any):
        if name is not None:
            self.name: str = name
        elif not getattr(self, "name", None):
            raise ValueError(f"{type(self).__name__} must have a name")
        self.__dict__.update(kwargs)
        if not hasattr(self, "start_urls"):
            self.start_urls: list[str] = []

    @property
    def logger(self) -> SpiderLoggerAdapter:
        from scrapy.utils.log import SpiderLoggerAdapter

        logger = logging.getLogger(self.name)
        return SpiderLoggerAdapter(logger, {"spider": self})

    def log(self, message: Any, level: int = logging.DEBUG, **kw: Any) -> None:
        """Log the given message at the given log level.

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        """
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args: Any, **kwargs: Any) -> Self:
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def _set_crawler(self, crawler: Crawler) -> None:
        self.crawler: Crawler = crawler
        self.settings: BaseSettings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    async def start(self) -> AsyncIterator[Any]:
        """Yield the initial :class:`~scrapy.Request` objects to send.

        .. versionadded:: 2.13

        For example:

        .. code-block:: python

            from scrapy import Request, Spider


            class MySpider(Spider):
                name = "myspider"

                async def start(self):
                    yield Request("https://toscrape.com/")

        The default implementation reads URLs from :attr:`start_urls` and
        yields a request for each with :attr:`~scrapy.Request.dont_filter`
        enabled. It is functionally equivalent to:

        .. code-block:: python

            async def start(self):
                for url in self.start_urls:
                    yield Request(url, dont_filter=True)

        You can also yield :ref:`items <topics-items>`. For example:

        .. code-block:: python

            async def start(self):
                yield {"foo": "bar"}

        To write spiders that work on Scrapy versions lower than 2.13, also
        define a synchronous ``start_requests()`` method that returns an
        iterable. For example:

        .. code-block:: python

            def start_requests(self):
                yield Request("https://toscrape.com/")

        .. seealso:: :ref:`start-requests`
        """
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ScrapyDeprecationWarning, module=r"^scrapy\.spiders$"
            )
            for item_or_request in self.start_requests():
                yield item_or_request

    def start_requests(self) -> Iterable[Any]:
        warnings.warn(
            (
                "The Spider.start_requests() method is deprecated, use "
                "Spider.start() instead. If you are calling "
                "super().start_requests() from a Spider.start() override, "
                "iterate super().start() instead."
            ),
            ScrapyDeprecationWarning,
            stacklevel=2,
        )
        if not self.start_urls and hasattr(self, "start_url"):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)"
            )
        for url in self.start_urls:
            yield Request(url, dont_filter=True)

    def _parse(self, response: Response, **kwargs: Any) -> Any:
        return self.parse(response, **kwargs)

    if TYPE_CHECKING:
        parse: CallbackT
    else:

        def parse(self, response: Response, **kwargs: Any) -> Any:
            raise NotImplementedError(
                f"{self.__class__.__name__}.parse callback is not defined"
            )

    @classmethod
    def update_settings(cls, settings: BaseSettings) -> None:
        settings.setdict(cls.custom_settings or {}, priority="spider")

    @classmethod
    def handles_request(cls, request: Request) -> bool:
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider: Spider, reason: str) -> Deferred[None] | None:
        closed = getattr(spider, "closed", None)
        if callable(closed):
            return cast("Deferred[None] | None", closed(reason))
        return None

    def __repr__(self) -> str:
        return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
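

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of scrapy.spiders): a minimal subclass that
# exercises the hooks defined above. The spider name, URLs, and setting value
# below are placeholder assumptions for the example only.
class ExampleSpider(Spider):
    name = "example"
    start_urls = ["https://toscrape.com/"]
    # Merged into the crawler settings at "spider" priority by update_settings().
    custom_settings = {"DOWNLOAD_DELAY": 1.0}

    async def start(self):
        # Keep the default start_urls handling, then queue one extra request,
        # following the "iterate super().start()" guidance from start_requests().
        async for item_or_request in super().start():
            yield item_or_request
        yield Request("https://toscrape.com/extra", callback=self.parse)

    def parse(self, response, **kwargs):
        # self.logger is the per-spider logger adapter; Spider.log() wraps it.
        self.logger.info("Parsed %s", response.url)
        yield {"url": response.url}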


# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import CSVFeedSpider, XMLFeedSpider
from scrapy.spiders.sitemap import SitemapSpider

__all__ = [
    "CSVFeedSpider",
    "CrawlSpider",
    "Rule",
    "SitemapSpider",
    "Spider",
    "XMLFeedSpider",
]
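

# ---------------------------------------------------------------------------
# Usage sketch (assumes the ExampleSpider above; not part of this module).
# The crawler applies update_settings() while building its settings, creates
# the spider through from_crawler(), and invokes close() on the spider_closed
# signal, as wired up in _set_crawler() above.
#
# from scrapy.crawler import CrawlerProcess
#
# process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
# process.crawl(ExampleSpider)
# process.start()  # blocks until the crawl finishes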