Add a script that retrieves the YouTube channel ID from its name.
scraping/get_youtube_channel_id.py (executable file, 63 lines added)
@@ -0,0 +1,63 @@
#!/usr/bin/env python

"""Get the YouTube channel ID from its name using web scraping."""

from argparse import ArgumentParser
from logging import WARNING
from typing import Iterator

from scrapy import Spider, Request
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response


class YoutubeChannelIdSpider(Spider):
    """Spider in charge of YouTube channel ID scraping."""

    name = "youtube_channel"
    allowed_domains = ['youtube.com']

    def __init__(self, channel, *args, **kwargs):
        """Init YoutubeChannelIdSpider.

        Args:
            channel: The name of the channel for which the ID will be retrieved.
            *args: Non-keyword arguments passed on to the Spider init method.
            **kwargs: Keyword arguments passed on to the Spider init method.
        """
        super().__init__(*args, **kwargs)
        self.channel = channel
        self.url = f'https://www.youtube.com/c/{channel}'

    def start_requests(self) -> Iterator[Request]:
        """Launch the HTTP request that fetches the YouTube channel main page."""
        yield Request(url=self.url,
                      callback=self.parse,
                      # Bypass the cookie consent interstitial served to new sessions.
                      cookies={'CONSENT': 'YES+1'})

    def parse(self, response: Response, **kwargs) -> None:
        """Parse the YouTube channel main page and extract its ID from it.

        Args:
            response: The scrapy.http.Response to parse.
            **kwargs: Unused arguments.
        """
        if channel_id := response.xpath(
                '//meta[@itemprop="channelId"]/@content').get():
            print(channel_id)
        else:
            print(f'Unable to find ID for channel {self.channel}')


if __name__ == '__main__':

    parser = ArgumentParser()
    parser.add_argument(
        'channel',
        help='The YouTube channel for which the ID shall be retrieved')
    parser_args = parser.parse_args()

    process = CrawlerProcess({'LOG_LEVEL': WARNING})
    process.crawl(YoutubeChannelIdSpider, channel=parser_args.channel)
    # The script will block here until the crawling is finished
    process.start()
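
For reference, the XPath used in parse() can be exercised offline against a stub page, without hitting youtube.com. The sketch below is only illustrative: it assumes scrapy is installed, and the channel ID and channel URL in it are made-up placeholders.

from scrapy.http import HtmlResponse

# Minimal page containing the meta tag the spider looks for; the ID is a placeholder.
stub_page = b'<html><head><meta itemprop="channelId" content="UC0000000000000000000000"></head></html>'
response = HtmlResponse(url='https://www.youtube.com/c/SomeChannel',
                        body=stub_page, encoding='utf-8')
# Same selector as YoutubeChannelIdSpider.parse(); prints the placeholder ID.
print(response.xpath('//meta[@itemprop="channelId"]/@content').get())

Against a real channel, the script is invoked as scraping/get_youtube_channel_id.py <channel name> and prints the resolved ID (or an error message) on stdout.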