#!/usr/bin/env python
"""Get the YouTube channel ID from its name using web scraping."""
from argparse import ArgumentParser
from logging import WARNING
from typing import Iterator

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response


class YoutubeChannelIdSpider(Spider):
    """Spider in charge of YouTube channel ID scraping."""

    name = 'youtube_channel'
    allowed_domains = ['youtube.com']

    def __init__(self, channel, *args, **kwargs):
        """Init YoutubeChannelIdSpider.

        Args:
            channel: The name of the channel for which the ID will be retrieved.
            *args: Non-keyword arguments passed on to the Spider init method.
            **kwargs: Keyword arguments passed on to the Spider init method.
        """
        super().__init__(*args, **kwargs)
        self.channel = channel
        self.url = f'https://www.youtube.com/c/{channel}'

    def start_requests(self) -> Iterator[Request]:
        """Launch the HTTP request that retrieves the YouTube channel main page."""
        # The CONSENT cookie skips the cookie-consent interstitial that
        # YouTube serves in some regions before showing the channel page.
        yield Request(url=self.url, callback=self.parse,
                      cookies={'CONSENT': 'YES+1'})

    def parse(self, response: Response, **kwargs) -> None:
        """Parse the YouTube channel main page and extract its ID from it.

        Args:
            response: The scrapy Response to parse.
            **kwargs: Unused arguments.
        """
        # The channel ID is embedded in the page metadata as
        # <meta itemprop="channelId" content="...">.
        if channel_id := response.xpath(
                '//meta[@itemprop="channelId"]/@content').get():
            print(channel_id)
        else:
            print(f'Unable to find ID for channel {self.channel}')


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        'channel',
        help='The YouTube channel for which the ID shall be retrieved')
    parser_args = parser.parse_args()

    process = CrawlerProcess({'LOG_LEVEL': WARNING})
    process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
    # The script will block here until the crawling is finished
    process.start()
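
# Example invocation (a sketch: the filename `youtube_channel_id.py` is an
# assumption, since the source does not name the script):
#
#   $ python youtube_channel_id.py <channel_name>
#
# On success, the channel ID is printed to stdout; YouTube channel IDs are
# 24-character strings beginning with "UC". Otherwise the "Unable to find ID"
# message is printed instead.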