From 119d41f27061d220d276a2d38cfc8d873437452a Mon Sep 17 00:00:00 2001
From: imanoreotwe <4606611+imanoreotwe@users.noreply.github.com>
Date: Sun, 26 May 2024 15:26:30 -0600
Subject: [PATCH] [ie/tiktok:collection] Add extractor (#9986)

Closes #9984
Authored by: imanoreotwe, bashonly
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/tiktok.py      | 73 +++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 37e6fc318..e9cd38a65 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2074,6 +2074,7 @@ from .threespeak import (
 )
 from .tiktok import (
     DouyinIE,
+    TikTokCollectionIE,
     TikTokEffectIE,
     TikTokIE,
     TikTokLiveIE,
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index 4113660a5..ab8efc19e 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -1117,6 +1117,79 @@ class TikTokTagIE(TikTokBaseListIE):
         return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id)
 
 
+class TikTokCollectionIE(TikTokBaseIE):
+    """Playlist extractor for a TikTok user's collection page.
+
+    Items are fetched page by page from ``_API_BASE_URL`` and each one is
+    yielded as a ``TikTokIE`` URL result.
+    """
+    IE_NAME = 'tiktok:collection'
+    # Named groups read by _real_extract(): user_id, title, id
+    _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
+    _TESTS = [{
+        # playlist should have exactly 9 videos
+        'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
+        'info_dict': {
+            'id': '7371330159376370462',
+            'title': 'imanoreotwe-count-test'
+        },
+        'playlist_count': 9
+    }, {
+        # tests returning multiple pages of a large collection
+        'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
+        'info_dict': {
+            'id': '7111887189571160875',
+            'title': 'imanoreotwe-%F0%9F%98%82'
+        },
+        'playlist_mincount': 100
+    }]
+    _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
+    # Sent as 'count' in every query and used as the cursor step between pages
+    _PAGE_COUNT = 30
+
+    def _build_web_query(self, collection_id, cursor):
+        """Build the query parameters for one ``item_list`` request."""
+        return {
+            'aid': '1988',
+            'collectionId': collection_id,
+            'count': self._PAGE_COUNT,
+            'cursor': cursor,
+            'sourceType': '113',
+        }
+
+    def _entries(self, collection_id):
+        """Yield a URL result for every item of the collection.
+
+        Pages are requested until a response lacks a truthy ``hasMore``.
+        """
+        cursor = 0
+        for page in itertools.count(1):
+            response = self._download_json(
+                self._API_BASE_URL, collection_id, f'Downloading page {page}',
+                query=self._build_web_query(collection_id, cursor))
+
+            # Only items of 'itemList' that carry a usable 'id' are processed
+            for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
+                video_id = video['id']
+                # First string among the author identifiers; '_' when there is none
+                author = traverse_obj(video, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
+                webpage_url = self._create_url(author, video_id)
+                yield self.url_result(
+                    webpage_url, TikTokIE,
+                    **self._parse_aweme_video_web(video, webpage_url, video_id, extract_flat=True))
+
+            if not traverse_obj(response, 'hasMore'):
+                break
+            cursor += self._PAGE_COUNT
+
+    def _real_extract(self, url):
+        """Return a playlist titled ``<user_id>-<title>`` for the collection URL."""
+        collection_id, title, user_name = self._match_valid_url(url).group('id', 'title', 'user_id')
+
+        return self.playlist_result(
+            self._entries(collection_id), collection_id, '-'.join((user_name, title)))
+
+
 class DouyinIE(TikTokBaseIE):
     _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
     _TESTS = [{