yt-dlp/yt_dlp/extractor/bitchute.py

import itertools
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    HEADRequest,
    clean_html,
    get_element_by_class,
    int_or_none,
    orderedSet,
    traverse_obj,
    unified_strdate,
    urlencode_postdata,
)


class BitChuteIE(InfoExtractor):
    """Extractor for single BitChute pages matched by ``_VALID_URL`` (video, embed and torrent URLs)."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
    _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
    _TESTS = [{
        'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',
        'md5': '7e427d7ed7af5a75b5855705ec750e2b',
        'info_dict': {
            'id': 'UGlrF9o9b-Q',
            'ext': 'mp4',
            'title': 'This is the first video on #BitChute !',
            'description': 'md5:a0337e7b1fe39e32336974af8173a034',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20170103',
        },
    }, {
        # video not downloadable in browser, but we can recover it
        'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',
        'md5': '05c12397d5354bf24494885b08d24ed1',
        'info_dict': {
            'id': '2s6B3nZjAk7R',
            'ext': 'mp4',
            'filesize': 71537926,
            'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',
            'description': 'md5:228ee93bd840a24938f536aeac9cf749',
            'thumbnail': r're:^https?://.*\.jpg$',
            'uploader': 'BitChute',
            'upload_date': '20181113',
        },
        'params': {'check_formats': None},
    }, {
        'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
        'only_matching': True,
    }, {
        'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
        'only_matching': True,
    }]

    # Headers sent with the webpage download and with every HEAD probe below
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
        'Referer': 'https://www.bitchute.com/',
    }

    def _check_format(self, video_url, video_id):
        """Probe ``video_url`` and its alternate seed hosts with HEAD requests.

        The URL is tried with its own ``seedNNN`` host first, then with that
        host replaced by each of seed150..seed153.

        Returns a dict with the first responding ``url`` and its ``filesize``
        (taken from the Content-Length header), or None if every candidate
        raised an ExtractorError.
        """
        # r'\g<2>' re-inserts the host that was matched, i.e. keeps the URL unchanged
        seed_hosts = (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')
        candidates = orderedSet(
            re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{seed}', video_url)
            for seed in seed_hosts)

        for candidate in candidates:
            try:
                head = self._request_webpage(
                    HEADRequest(candidate), video_id=video_id,
                    note=f'Checking {candidate}', headers=self._HEADERS)
            except ExtractorError as err:
                self.to_screen(f'{video_id}: URL is invalid, skipping: {err.cause}')
            else:
                return {
                    'url': candidate,
                    'filesize': int_or_none(head.headers.get('Content-Length')),
                }
        return None

    def _real_extract(self, url):
        """Download the video page for ``url`` and build its info dict."""
        video_id = self._match_id(url)
        # Always fetch the canonical /video/ page, whichever URL form was matched
        webpage = self._download_webpage(
            f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)

        publish_date = clean_html(get_element_by_class('video-publish-date', webpage))
        entries = self._parse_html5_media_entries(url, webpage, video_id)

        # Probing is skipped only when check_formats is explicitly False
        verify_urls = self.get_param('check_formats') is not False
        formats = []
        for fmt in traverse_obj(entries, (0, 'formats', ...)):
            if verify_urls:
                verified = self._check_format(fmt.pop('url'), video_id)
                if not verified:
                    # No candidate host answered; drop this format
                    continue
                fmt.update(verified)
            formats.append(fmt)

        if not formats:
            self.raise_no_formats(
                'Video is unavailable. Please make sure this video is playable in the browser '
                'before reporting this issue.', expected=True, video_id=video_id)
        self._sort_formats(formats)

        # Metadata lookups are kept in this order so any warnings appear in the same sequence
        title = self._html_extract_title(webpage) or self._og_search_title(webpage)
        description = self._og_search_description(webpage, default=None)
        thumbnail = self._og_search_thumbnail(webpage)
        uploader = clean_html(get_element_by_class('owner', webpage))
        upload_date = unified_strdate(self._search_regex(
            r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'upload_date': upload_date,
            'formats': formats,
        }


class BitChuteChannelIE(InfoExtractor):
    """Extractor that lists the videos of a BitChute channel as a playlist."""

    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://www.bitchute.com/channel/victoriaxrave/',
        'playlist_mincount': 185,
        'info_dict': {
            'id': 'victoriaxrave',
        },
    }

    # Sent both as the csrfmiddlewaretoken form field and as the csrftoken cookie
    _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'

    def _entries(self, channel_id):
        """Yield a url_result for every video on the channel, page by page.

        Pages are requested from the channel's ``extend/`` endpoint with an
        increasing ``offset``. Iteration stops on the first response that
        reports ``success`` as False, carries no ``html``, or contains no
        video links.
        """
        channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
        offset = 0
        for page_num in itertools.count(1):
            form = {
                'csrfmiddlewaretoken': self._TOKEN,
                'name': '',
                'offset': offset,
            }
            request_headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': channel_url,
                'X-Requested-With': 'XMLHttpRequest',
                'Cookie': 'csrftoken=%s' % self._TOKEN,
            }
            page = self._download_json(
                '%sextend/' % channel_url, channel_id,
                'Downloading channel page %d' % page_num,
                data=urlencode_postdata(form), headers=request_headers)

            if page.get('success') is False or not page.get('html'):
                return
            found_ids = re.findall(
                r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
                page.get('html'))
            if not found_ids:
                return

            # The next request starts after the videos seen so far
            offset += len(found_ids)
            yield from (
                self.url_result(
                    'https://www.bitchute.com/video/%s' % found_id,
                    ie=BitChuteIE.ie_key(), video_id=found_id)
                for found_id in found_ids)

    def _real_extract(self, url):
        """Return a playlist result, keyed by channel id, over the channel's videos."""
        channel_id = self._match_id(url)
        entries = self._entries(channel_id)
        return self.playlist_result(entries, playlist_id=channel_id)
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`import itertools`
			`import re`

			`from .common import InfoExtractor`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`from ..utils import (`
[bitchute] Fix error for geoblocking Closes #26564. 2020-09-11 23:31:44 +02:00			`ExtractorError,`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`HEADRequest,`
			`clean_html,`
			`get_element_by_class,`
			`int_or_none,`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`orderedSet,`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`traverse_obj,`
[bitchute] Extract upload date (closes #22990) (#23193) 2019-11-26 18:20:39 +01:00			`unified_strdate,`
[bitchute] Fix extraction (closes #18567) 2019-01-01 12:12:44 +01:00			`urlencode_postdata,`
			`)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00

			`class BitChuteIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video\|embed\|torrent/[^/]+)/(?P<id>[^/?#&]+)'`
[extractors] Use new framework for existing embeds (#4307) `Brightcove` is difficult to migrate because it's subclasses may depend on the signature of the current functions. So it is left as-is for now Note: Tests have not been migrated 2022-08-01 03:23:25 +02:00			`_EMBED_REGEX = [rf'<(?:script\|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`_TESTS = [{`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/',`
			`'md5': '7e427d7ed7af5a75b5855705ec750e2b',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'info_dict': {`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`'id': 'UGlrF9o9b-Q',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'ext': 'mp4',`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'title': 'This is the first video on #BitChute !',`
			`'description': 'md5:a0337e7b1fe39e32336974af8173a034',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'thumbnail': r're:^https?://.*\.jpg$',`
[bitchute] Fix test (#758) Authored by: mahanstreamer 2021-08-22 21:58:23 +02:00			`'uploader': 'BitChute',`
			`'upload_date': '20170103',`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`},`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`}, {`
			`# video not downloadable in browser, but we can recover it`
			`'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/',`
			`'md5': '05c12397d5354bf24494885b08d24ed1',`
			`'info_dict': {`
			`'id': '2s6B3nZjAk7R',`
			`'ext': 'mp4',`
			`'filesize': 71537926,`
			`'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control',`
			`'description': 'md5:228ee93bd840a24938f536aeac9cf749',`
			`'thumbnail': r're:^https?://.*\.jpg$',`
			`'uploader': 'BitChute',`
			`'upload_date': '20181113',`
			`},`
			`'params': {'check_formats': None},`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`}, {`
			`'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',`
			`'only_matching': True,`
			`}, {`
			`'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',`
			`'only_matching': True,`
			`}]`

[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`_HEADERS = {`
			`'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',`
			`'Referer': 'https://www.bitchute.com/',`
			`}`

			`def _check_format(self, video_url, video_id):`
			`urls = orderedSet(`
			`re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url)`
			`for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153'))`
			`for url in urls:`
			`try:`
			`response = self._request_webpage(`
			`HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS)`
			`except ExtractorError as e:`
			`self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}')`
			`continue`
			`return {`
			`'url': url,`
			`'filesize': int_or_none(response.headers.get('Content-Length'))`
			`}`

[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`def _real_extract(self, url):`
			`video_id = self._match_id(url)`
			`webpage = self._download_webpage(`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`publish_date = clean_html(get_element_by_class('video-publish-date', webpage))`
			`entries = self._parse_html5_media_entries(url, webpage, video_id)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`formats = []`
			`for format_ in traverse_obj(entries, (0, 'formats', ...)):`
			`if self.get_param('check_formats') is not False:`
			`format_.update(self._check_format(format_.pop('url'), video_id) or {})`
			`if 'url' not in format_:`
			`continue`
			`formats.append(format_)`
[bitchute] Extract HTML5 formats (closes #21306) 2019-06-07 17:58:19 +02:00
			`if not formats:`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`self.raise_no_formats(`
			`'Video is unavailable. Please make sure this video is playable in the browser '`
			`'before reporting this issue.', expected=True, video_id=video_id)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`self._sort_formats(formats)`

			`return {`
			`'id': video_id,`
[extractor/bitchute] Simplify extractor (#5066) * Check alternate domains when a URL does not work * Obey `--no-check-formats` * Remove webseeds (doesnt seem to exist anymore) Authored by: flashdagger, pukkandan Co-authored-by: Marcel <flashdagger@googlemail.com> 2022-11-04 15:08:38 +01:00			`'title': self._html_extract_title(webpage) or self._og_search_title(webpage),`
			`'description': self._og_search_description(webpage, default=None),`
			`'thumbnail': self._og_search_thumbnail(webpage),`
			`'uploader': clean_html(get_element_by_class('owner', webpage)),`
			`'upload_date': unified_strdate(self._search_regex(`
			`r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)),`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`'formats': formats,`
			`}`


			`class BitChuteChannelIE(InfoExtractor):`
			`_VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'`
			`_TEST = {`
			`'url': 'https://www.bitchute.com/channel/victoriaxrave/',`
			`'playlist_mincount': 185,`
			`'info_dict': {`
			`'id': 'victoriaxrave',`
			`},`
			`}`

			`_TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'`

			`def _entries(self, channel_id):`
			`channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`offset = 0`
			`for page_num in itertools.count(1):`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`data = self._download_json(`
			`'%sextend/' % channel_url, channel_id,`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`'Downloading channel page %d' % page_num,`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`data=urlencode_postdata({`
			`'csrfmiddlewaretoken': self._TOKEN,`
			`'name': '',`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`'offset': offset,`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`}), headers={`
			`'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',`
			`'Referer': channel_url,`
			`'X-Requested-With': 'XMLHttpRequest',`
			`'Cookie': 'csrftoken=%s' % self._TOKEN,`
			`})`
			`if data.get('success') is False:`
			`break`
			`html = data.get('html')`
			`if not html:`
			`break`
			`video_ids = re.findall(`
			`r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',`
			`html)`
			`if not video_ids:`
			`break`
[bitchute] Improve page offset 2018-08-11 20:52:50 +02:00			`offset += len(video_ids)`
[bitchute] Add extractor (closes #14052) 2018-08-11 20:47:10 +02:00			`for video_id in video_ids:`
			`yield self.url_result(`
			`'https://www.bitchute.com/video/%s' % video_id,`
			`ie=BitChuteIE.ie_key(), video_id=video_id)`

			`def _real_extract(self, url):`
			`channel_id = self._match_id(url)`
			`return self.playlist_result(`
			`self._entries(channel_id), playlist_id=channel_id)`