From f84765460252e809dd8044a93281671f218abec7 Mon Sep 17 00:00:00 2001
From: ASR
Date: Tue, 31 May 2022 08:19:44 +0200
Subject: [PATCH] Add a script retrieving the YouTube channel ID from its name.

---
 .dir-locals.el                     |  8 ++++
 mypy.ini                           |  2 +
 scraping/get_youtube_channel_id.py | 63 ++++++++++++++++++++++++++++++
 setup.cfg                          | 15 +++++++
 4 files changed, 88 insertions(+)
 create mode 100644 .dir-locals.el
 create mode 100644 mypy.ini
 create mode 100755 scraping/get_youtube_channel_id.py
 create mode 100644 setup.cfg

diff --git a/.dir-locals.el b/.dir-locals.el
new file mode 100644
index 0000000..375ec20
--- /dev/null
+++ b/.dir-locals.el
@@ -0,0 +1,8 @@
+((nil . ((eval . (set (make-local-variable 'project-root-dir) (replace-regexp-in-string "~" (getenv "HOME") (locate-dominating-file default-directory ".dir-locals.el"))))))
+ ;; TODO: Disable hicpp-no-array-decay/cppcoreguidelines-pro-bounds-array-to-pointer-decay once c++20.
+ (c++-mode . ((c-basic-offset . 4)
+              (eval . (set 'lsp-clients-clangd-args (list "-j=4" "-background-index" "--clang-tidy"
+                                                          (format "--compile-commands-dir=%s" project-root-dir)
+                                                          "--clang-tidy-checks=*,-llvmlibc-callee-namespace,-fuchsia-default-arguments-calls,-fuchsia-default-arguments-declarations,-llvmlibc-restrict-system-libc-headers,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-hicpp-no-array-decay,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-google-runtime-int,-google-readability-todo"
+                                                          "--limit-results=0")))))
+ )
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..93e104b
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,2 @@
+[mypy-scrapy.*]
+ignore_missing_imports = True
diff --git a/scraping/get_youtube_channel_id.py b/scraping/get_youtube_channel_id.py
new file mode 100755
index 0000000..14d745a
--- /dev/null
+++ b/scraping/get_youtube_channel_id.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+
+"""Get the YouTube channel ID from its name using web scraping."""
+
+from argparse import ArgumentParser
+from logging import WARNING
+from typing import Iterator
+
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerProcess
+from scrapy.http import Response
+
+
+class YoutubeChannelIdSpider(Spider):
+    """Spider in charge of YouTube channel ID scraping."""
+
+    name = "youtube_channel"
+    allowed_domains = ['youtube.com']
+
+    def __init__(self, channel, *args, **kwargs):
+        """Init YoutubeChannelIdSpider.
+
+        Args:
+            channel: The name of the channel for which the ID will be retrieved.
+            *args: Non-keyword arguments passed on to the Spider init method.
+            **kwargs: Keyword arguments passed on to the Spider init method.
+        """
+        super().__init__(*args, **kwargs)
+        self.channel = channel
+        self.url = f'https://www.youtube.com/c/{channel}'
+
+    def start_requests(self) -> Iterator[Request]:
+        """Launch the HTTP request that fetches the YouTube channel's main page."""
+        yield Request(url=self.url,
+                      callback=self.parse,
+                      cookies={'CONSENT': 'YES+1'})
+
+    def parse(self, response: Response, **kwargs) -> None:
+        """Parse the YouTube channel main page and extract its ID from it.
+
+        Args:
+            response: The scrapy.http.Response to parse.
+            **kwargs: Unused keyword arguments.
+        """
+        if channel_id := response.xpath(
+                '//meta[@itemprop="channelId"]/@content').get():
+            print(channel_id)
+        else:
+            print(f'Unable to find ID for channel {self.channel}')
+
+
+if __name__ == '__main__':
+
+    parser = ArgumentParser()
+    parser.add_argument(
+        'channel',
+        help='The YouTube channel for which the ID shall be retrieved')
+    parser_args = parser.parse_args()
+
+    process = CrawlerProcess({'LOG_LEVEL': WARNING})
+    process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
+    # The script will block here until the crawling is finished.
+    process.start()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..0bb886c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,15 @@
+[pycodestyle]
+max_line_length = 100
+
+[flake8]
+max_line_length = 100
+
+[yapf]
+based_on_style = "pep8"
+column_limit = 100
+
+[pylint]
+max-line-length = 100
+
+[mypy-scrapy.*]
+ignore_missing_imports = True
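
Usage note (not part of the patch): the script is run as ./scraping/get_youtube_channel_id.py <channel-name> and prints the channel ID on stdout, or a message saying the ID could not be found. Below is a minimal sketch showing how the selector used in YoutubeChannelIdSpider.parse() can be exercised offline with scrapy.http.HtmlResponse; it assumes only that scrapy is installed, and the HTML fragment, the URL, and the UC_example_channel_id value are made up for illustration.

from scrapy.http import HtmlResponse

# Hand-written stand-in for a channel page; the channelId value is fictitious.
FAKE_HTML = b'''
<html>
  <head>
    <meta itemprop="channelId" content="UC_example_channel_id">
  </head>
  <body></body>
</html>
'''

# Build a response the way Scrapy would hand one to the parse() callback.
response = HtmlResponse(url='https://www.youtube.com/c/SomeChannel',
                        body=FAKE_HTML,
                        encoding='utf-8')

# Same XPath as in YoutubeChannelIdSpider.parse().
channel_id = response.xpath('//meta[@itemprop="channelId"]/@content').get()
print(channel_id)  # prints: UC_example_channel_id

Checking the XPath this way needs no network access and no CONSENT cookie handling; those only matter for the real crawl.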