Add a script retrieving the YouTube channel ID from its name.
This commit is contained in:
8
.dir-locals.el
Normal file
8
.dir-locals.el
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
((nil . ((eval . (set (make-local-variable 'project-root-dir) (replace-regexp-in-string "~" (getenv "HOME") (locate-dominating-file default-directory ".dir-locals.el"))))))
|
||||||
|
;; TODO: Disable hicpp-no-array-decay/cppcoreguidelines-pro-bounds-array-to-pointer-decay once c++20.
|
||||||
|
(c++-mode . ((c-basic-offset . 4)
|
||||||
|
(eval . (set 'lsp-clients-clangd-args (list "-j=4" "-background-index" "--clang-tidy"
|
||||||
|
(format "--compile-commands-dir=%s" project-root-dir)
|
||||||
|
"--clang-tidy-checks=*,-llvmlibc-callee-namespace,-fuchsia-default-arguments-calls,-fuchsia-default-arguments-declarations,-llvmlibc-restrict-system-libc-headers,-cppcoreguidelines-pro-type-vararg,-hicpp-vararg,-hicpp-no-array-decay,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-google-runtime-int,-google-readability-todo"
|
||||||
|
"--limit-results=0")))))
|
||||||
|
)
|
63
scraping/get_youtube_channel_id.py
Executable file
63
scraping/get_youtube_channel_id.py
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
"""Get the Youtube channel ID from its name using webscraping."""
|
||||||
|
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from logging import WARNING
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from scrapy import Spider, Request
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.http import Response
|
||||||
|
|
||||||
|
|
||||||
|
class YoutubeChannelIdSpider(Spider):
|
||||||
|
"""Spider in charge of Youtube channel ID scraping."""
|
||||||
|
|
||||||
|
name = "youtube_channel"
|
||||||
|
allowed_domains = ['youtube.com']
|
||||||
|
|
||||||
|
def __init__(self, channel, *args, **kwargs):
|
||||||
|
"""Init YoutubeChannelIdSpider.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel: The name of the channel for which the ID will be retrieved.
|
||||||
|
*args: Non-keyword arguments transmitted to the Spider init method.
|
||||||
|
**kwargs: Keyword arguments given to the Spider init method.
|
||||||
|
"""
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.channel = channel
|
||||||
|
self.url = f'https://www.youtube.com/c/{channel}'
|
||||||
|
|
||||||
|
def start_requests(self) -> Iterator[Request]:
|
||||||
|
"""Lauch the HTTP request which allows to get Youtube channel main page."""
|
||||||
|
yield Request(url=self.url,
|
||||||
|
callback=self.parse,
|
||||||
|
cookies={'CONSENT': 'YES+1'})
|
||||||
|
|
||||||
|
def parse(self, response: Response, **kwargs) -> None:
|
||||||
|
"""Parse the Youtube channel main page and extract its ID from it.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
response: The scapy.Response to parse.
|
||||||
|
**kwargs: Not used argument.
|
||||||
|
"""
|
||||||
|
if channel_id := response.xpath(
|
||||||
|
'//meta[@itemprop="channelId"]/@content').get():
|
||||||
|
print(channel_id)
|
||||||
|
else:
|
||||||
|
print(f'Unable to find ID for channel {self.channel}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
parser = ArgumentParser()
|
||||||
|
parser.add_argument(
|
||||||
|
'channel',
|
||||||
|
help='The Youtube channel for which the ID shall be retrieved')
|
||||||
|
parser_args = parser.parse_args()
|
||||||
|
|
||||||
|
process = CrawlerProcess({'LOG_LEVEL': WARNING})
|
||||||
|
process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
|
||||||
|
# The script will block here until the crawling is finished
|
||||||
|
process.start()
|
Reference in New Issue
Block a user