From f84765460252e809dd8044a93281671f218abec7 Mon Sep 17 00:00:00 2001
From: ASR
Date: Tue, 31 May 2022 08:19:44 +0200
Subject: [PATCH] Add a script retrieving the YouTube channel ID from its name.

---
 .dir-locals.el                     |  8 ++++
 mypy.ini                           |  2 +
 scraping/get_youtube_channel_id.py | 63 ++++++++++++++++++++++++++++++
 setup.cfg                          | 15 +++++++
 4 files changed, 88 insertions(+)
 create mode 100644 .dir-locals.el
 create mode 100644 mypy.ini
 create mode 100755 scraping/get_youtube_channel_id.py
 create mode 100644 setup.cfg

diff --git a/.dir-locals.el b/.dir-locals.el
new file mode 100644
index 0000000..375ec20
--- /dev/null
+++ b/.dir-locals.el
@@ -0,0 +1,8 @@
+((nil . ((eval . (set (make-local-variable 'project-root-dir) (replace-regexp-in-string "~" (getenv "HOME") (locate-dominating-file default-directory ".dir-locals.el"))))))
+ ;; TODO: Disable hicpp-no-array-decay/cppcoreguidelines-pro-bounds-array-to-pointer-decay once c++20.
+ (c++-mode . ((c-basic-offset . 4)
+              (eval . (set 'lsp-clients-clangd-args (list "-j=4" "-background-index" "--clang-tidy"
+                                                          (format "--compile-commands-dir=%s" project-root-dir)
+                                                          "--clang-tidy-checks=*,-llvmlibc-callee-namespace,-fuchsia-default-arguments-calls,-fuchsia-default-arguments-declarations,-llvmlibc-restrict-system-libc-headers,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-hicpp-no-array-decay,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-google-runtime-int,-google-readability-todo"
+                                                          "--limit-results=0")))))
+ )
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..93e104b
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,2 @@
+[mypy-scrapy.*]
+ignore_missing_imports = True
diff --git a/scraping/get_youtube_channel_id.py b/scraping/get_youtube_channel_id.py
new file mode 100755
index 0000000..14d745a
--- /dev/null
+++ b/scraping/get_youtube_channel_id.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+
+"""Get the YouTube channel ID from its name using web scraping."""
+
+from argparse import ArgumentParser
+from logging import WARNING
+from typing import Iterator
+
+from scrapy import Spider, Request
+from scrapy.crawler import CrawlerProcess
+from scrapy.http import Response
+
+
+class YoutubeChannelIdSpider(Spider):
+    """Spider in charge of YouTube channel ID scraping."""
+
+    name = "youtube_channel"
+    allowed_domains = ['youtube.com']
+
+    def __init__(self, channel, *args, **kwargs):
+        """Init YoutubeChannelIdSpider.
+
+        Args:
+            channel: The name of the channel for which the ID will be retrieved.
+            *args: Non-keyword arguments passed on to the Spider init method.
+            **kwargs: Keyword arguments passed on to the Spider init method.
+        """
+        super().__init__(*args, **kwargs)
+        self.channel = channel
+        self.url = f'https://www.youtube.com/c/{channel}'
+
+    def start_requests(self) -> Iterator[Request]:
+        """Launch the HTTP request that fetches the YouTube channel's main page."""
+        yield Request(url=self.url,
+                      callback=self.parse,
+                      cookies={'CONSENT': 'YES+1'})
+
+    def parse(self, response: Response, **kwargs) -> None:
+        """Parse the YouTube channel main page and extract its ID from it.
+
+        Args:
+            response: The scrapy.http.Response to parse.
+            **kwargs: Unused keyword arguments.
+        """
+        if channel_id := response.xpath(
+                '//meta[@itemprop="channelId"]/@content').get():
+            print(channel_id)
+        else:
+            print(f'Unable to find ID for channel {self.channel}')
+
+
+if __name__ == '__main__':
+
+    parser = ArgumentParser()
+    parser.add_argument(
+        'channel',
+        help='The YouTube channel for which the ID shall be retrieved')
+    parser_args = parser.parse_args()
+
+    process = CrawlerProcess({'LOG_LEVEL': WARNING})
+    process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
+    # The script will block here until the crawling is finished.
+    process.start()
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..0bb886c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,15 @@
+[pycodestyle]
+max_line_length = 100
+
+[flake8]
+max_line_length = 100
+
+[yapf]
+based_on_style = "pep8"
+column_limit = 100
+
+[pylint]
+max-line-length = 100
+
+[mypy-scrapy.*]
+ignore_missing_imports = True
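
Usage note (not part of the patch): the script is run as ./scraping/get_youtube_channel_id.py <channel-name> and prints the channel ID on stdout, or a message saying the ID could not be found. Below is a minimal sketch showing how the selector used in YoutubeChannelIdSpider.parse() can be exercised offline with scrapy.http.HtmlResponse; it assumes only that scrapy is installed, and the HTML fragment, the URL, and the UC_example_channel_id value are made up for illustration.

from scrapy.http import HtmlResponse

# Hand-written stand-in for a channel page; the channelId value is fictitious.
FAKE_HTML = b'''
<html>
  <head>
    <meta itemprop="channelId" content="UC_example_channel_id">
  </head>
  <body></body>
</html>
'''

# Build a response the way Scrapy would hand one to the parse() callback.
response = HtmlResponse(url='https://www.youtube.com/c/SomeChannel',
                        body=FAKE_HTML,
                        encoding='utf-8')

# Same XPath as in YoutubeChannelIdSpider.parse().
channel_id = response.xpath('//meta[@itemprop="channelId"]/@content').get()
print(channel_id)  # prints: UC_example_channel_id

Checking the XPath this way needs no network access and no CONSENT cookie handling; those only matter for the real crawl.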