Add a script that retrieves the YouTube channel ID from its name.
scraping/get_youtube_channel_id.py (executable file, 63 lines added)
@@ -0,0 +1,63 @@
#!/usr/bin/env python

"""Get the YouTube channel ID from its name using web scraping."""

from argparse import ArgumentParser
from logging import WARNING
from typing import Iterator

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response


class YoutubeChannelIdSpider(Spider):
    """Spider in charge of YouTube channel ID scraping."""

    name = "youtube_channel"
    allowed_domains = ['youtube.com']

    def __init__(self, channel, *args, **kwargs):
        """Init YoutubeChannelIdSpider.

        Args:
            channel: The name of the channel for which the ID will be retrieved.
            *args: Non-keyword arguments passed on to the Spider init method.
            **kwargs: Keyword arguments passed on to the Spider init method.
        """
        super().__init__(*args, **kwargs)
        self.channel = channel
        self.url = f'https://www.youtube.com/c/{channel}'

    def start_requests(self) -> Iterator[Request]:
        """Launch the HTTP request that fetches the YouTube channel main page."""
        yield Request(url=self.url,
                      callback=self.parse,
                      # Bypass the cookie consent interstitial served to new sessions.
                      cookies={'CONSENT': 'YES+1'})

    def parse(self, response: Response, **kwargs) -> None:
        """Parse the YouTube channel main page and extract its ID from it.

        Args:
            response: The scrapy.http.Response to parse.
            **kwargs: Unused arguments.
        """
        if channel_id := response.xpath(
                '//meta[@itemprop="channelId"]/@content').get():
            print(channel_id)
        else:
            print(f'Unable to find ID for channel {self.channel}')


if __name__ == '__main__':

    parser = ArgumentParser()
    parser.add_argument(
        'channel',
        help='The YouTube channel for which the ID shall be retrieved')
    parser_args = parser.parse_args()

    process = CrawlerProcess({'LOG_LEVEL': WARNING})
    process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
    # The script will block here until the crawling is finished
    process.start()
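
For reference, the XPath used in parse() can be exercised offline against a stub page, without hitting youtube.com. The sketch below is only illustrative: it assumes scrapy is installed, and the channel ID and channel URL in it are made-up placeholders.

from scrapy.http import HtmlResponse

# Minimal page containing the meta tag the spider looks for; the ID is a placeholder.
stub_page = b'<html><head><meta itemprop="channelId" content="UC0000000000000000000000"></head></html>'
response = HtmlResponse(url='https://www.youtube.com/c/SomeChannel',
                        body=stub_page, encoding='utf-8')
# Same selector as YoutubeChannelIdSpider.parse(); prints the placeholder ID.
print(response.xpath('//meta[@itemprop="channelId"]/@content').get())

Against a real channel, the script is invoked as scraping/get_youtube_channel_id.py <channel name> and prints the resolved ID (or an error message) on stdout.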