Files
tools/scraping/get_youtube_channel_id.py

64 lines
2.1 KiB
Python
Executable File

#!/usr/bin/env python
"""Get the Youtube channel ID from its name using webscraping."""
from argparse import ArgumentParser
from logging import WARNING
from typing import Iterator
from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response
class YoutubeChannelIdSpider(Spider):
    """Spider that retrieves a Youtube channel's ID from its public page."""

    name = "youtube_channel"
    allowed_domains = ['youtube.com']

    def __init__(self, channel, *args, **kwargs):
        """Init YoutubeChannelIdSpider.

        Args:
            channel: Name of the channel whose ID will be looked up.
            *args: Positional arguments forwarded to Spider.__init__.
            **kwargs: Keyword arguments forwarded to Spider.__init__.
        """
        super().__init__(*args, **kwargs)
        self.channel = channel
        self.url = f'https://www.youtube.com/c/{channel}'

    def start_requests(self) -> Iterator[Request]:
        """Launch the HTTP request that fetches the channel's main page."""
        # The CONSENT cookie bypasses Youtube's cookie-consent interstitial,
        # which would otherwise be served instead of the channel page.
        yield Request(url=self.url,
                      callback=self.parse,
                      cookies={'CONSENT': 'YES+1'})

    def parse(self, response: Response, **kwargs) -> None:
        """Extract the channel ID from the channel main page and print it.

        Args:
            response: The scrapy Response to parse.
            **kwargs: Unused.
        """
        # The channel ID is embedded as a <meta itemprop="channelId"> tag.
        channel_id = response.xpath(
            '//meta[@itemprop="channelId"]/@content').get()
        if channel_id:
            print(channel_id)
        else:
            print(f'Unable to find ID for channel {self.channel}')
if __name__ == '__main__':
    # Command-line interface: one positional argument, the channel name.
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        'channel',
        help='The Youtube channel for which the ID shall be retrieved')
    cli_args = arg_parser.parse_args()

    # Quiet scrapy's default INFO logging so only the result is printed.
    crawler = CrawlerProcess({'LOG_LEVEL': WARNING})
    crawler.crawl(YoutubeChannelIdSpider, channel=cli_args.channel)
    # start() blocks here until the crawl has finished.
    crawler.start()