Add a script retrieving the YouTube channel ID from its name.
This commit is contained in:
8
.dir-locals.el
Normal file
8
.dir-locals.el
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
((nil . ((eval . (set (make-local-variable 'project-root-dir) (replace-regexp-in-string "~" (getenv "HOME") (locate-dominating-file default-directory ".dir-locals.el"))))))
|
||||||
|
;; TODO: Disable hicpp-no-array-decay/cppcoreguidelines-pro-bounds-array-to-pointer-decay once c++20.
|
||||||
|
(c++-mode . ((c-basic-offset . 4)
|
||||||
|
(eval . (set 'lsp-clients-clangd-args (list "-j=4" "-background-index" "--clang-tidy"
|
||||||
|
(format "--compile-commands-dir=%s" project-root-dir)
|
||||||
|
"--clang-tidy-checks=*,-llvmlibc-callee-namespace,-fuchsia-default-arguments-calls,-fuchsia-default-arguments-declarations,-llvmlibc-restrict-system-libc-headers,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-hicpp-no-array-decay,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-google-runtime-int,-google-readability-todo"
|
||||||
|
"--limit-results=0")))))
|
||||||
|
)
|
63
scraping/get_youtube_channel_id.py
Executable file
63
scraping/get_youtube_channel_id.py
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
"""Get the Youtube channel ID from its name using webscraping."""
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from logging import WARNING
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from scrapy import Spider, Request
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.http import Response
|
||||||
|
|
||||||
|
|
||||||
|
class YoutubeChannelIdSpider(Spider):
|
||||||
|
"""Spider in charge of Youtube channel ID scraping."""
|
||||||
|
|
||||||
|
name = "youtube_channel"
|
||||||
|
allowed_domains = ['youtube.com']
|
||||||
|
|
||||||
|
def __init__(self, channel, *args, **kwargs):
|
||||||
|
"""Init YoutubeChannelIdSpider.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel: The name of the channel for which the ID will be retrieved.
|
||||||
|
*args: Non-keyword arguments transmitted to the Spider init method.
|
||||||
|
**kwargs: Keyword arguments given to the Spider init method.
|
||||||
|
"""
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.channel = channel
|
||||||
|
self.url = f'https://www.youtube.com/c/{channel}'
|
||||||
|
|
||||||
|
def start_requests(self) -> Iterator[Request]:
|
||||||
|
"""Lauch the HTTP request which allows to get Youtube channel main page."""
|
||||||
|
yield Request(url=self.url,
|
||||||
|
callback=self.parse,
|
||||||
|
cookies={'CONSENT': 'YES+1'})
|
||||||
|
|
||||||
|
def parse(self, response: Response, **kwargs) -> None:
|
||||||
|
"""Parse the Youtube channel main page and extract its ID from it.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
response: The scapy.Response to parse.
|
||||||
|
**kwargs: Not used argument.
|
||||||
|
"""
|
||||||
|
if channel_id := response.xpath(
|
||||||
|
'//meta[@itemprop="channelId"]/@content').get():
|
||||||
|
print(channel_id)
|
||||||
|
else:
|
||||||
|
print(f'Unable to find ID for channel {self.channel}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
parser = ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
'channel',
|
||||||
|
help='The Youtube channel for which the ID shall be retrieved')
|
||||||
|
parser_args = parser.parse_args()
|
||||||
|
|
||||||
|
process = CrawlerProcess({'LOG_LEVEL': WARNING})
|
||||||
|
process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
|
||||||
|
# The script will block here until the crawling is finished
|
||||||
|
process.start()
|
Reference in New Issue
Block a user