From e3a88568b0457f18a03a2a894287a03f7cc2dbbe Mon Sep 17 00:00:00 2001 From: Yasoob Date: Sun, 11 Aug 2013 22:23:05 +0500 Subject: [PATCH 01/10] Added an IE for hark.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/hark.py | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/hark.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 84c02c2ed..ca5664577 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -29,6 +29,7 @@ from .gametrailers import GametrailersIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .hark import HarkIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py new file mode 100644 index 000000000..ab0a69697 --- /dev/null +++ b/youtube_dl/extractor/hark.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class HarkIE(InfoExtractor): + _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _TEST = { + u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + u'file': u'mmbzyhkgny.mp3', + u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', + u'info_dict': { + u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) + webpage = self._download_webpage(embed_url, video_id) + + final_url = self._search_regex(r'src="(.+?).mp3"', + webpage, 'video url')+'.mp3' + title = self._html_search_regex(r'(.+?)', + webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( + 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + + return {'id': video_id, + 'url' : final_url, + 'title': title, + 'ext': determine_ext(final_url), + } From cd0abcc0bb4c218fd02850a139b626d252e22599 Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Thu, 22 Aug 2013 13:54:23 +0200 Subject: [PATCH 02/10] Extractor for canalc2.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/canalc2.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 youtube_dl/extractor/canalc2.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9d12608e1..576b8433a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -7,6 +7,7 @@ from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py new file mode 100644 index 000000000..d0e2ed536 --- /dev/null +++ b/youtube_dl/extractor/canalc2.py @@ -0,0 +1,37 @@ +# coding: utf-8 +"""Extractor for canalc2.tv""" +import re +import lxml.html + +from .common import InfoExtractor + +class Canalc2IE(InfoExtractor): + """Extractor for canalc2.tv""" + _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' + + _TEST = { + u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + u'file': u'12163.mp4', + u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'info_dict': { + u'title': u'Terrasses du Numérique' + } + } + + def _real_extract(self, url): + video_id = re.match(self._VALID_URL, url).group(1) + webpage = self._download_webpage(url, video_id) + file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", + webpage).group(1) + + video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + + html = lxml.html.fromstring(webpage) + + title = html.cssselect('.evenement8')[0].text_content() + + return {'id': video_id, + 'ext' : 'mp4', + 'url' : video_url, + 'title' : title + } From ff2424595adf02cbe5d1f1071e53c3b2e5f32c9e Mon Sep 17 00:00:00 2001 From: Pierre Rudloff Date: Thu, 22 Aug 2013 14:47:51 +0200 Subject: [PATCH 03/10] lxml is not part of the standard library. --- youtube_dl/extractor/canalc2.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d0e2ed536..215abf537 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,7 +1,6 @@ # coding: utf-8 """Extractor for canalc2.tv""" import re -import lxml.html from .common import InfoExtractor @@ -25,10 +24,9 @@ class Canalc2IE(InfoExtractor): webpage).group(1) video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - - html = lxml.html.fromstring(webpage) - - title = html.cssselect('.evenement8')[0].text_content() + + title = self._html_search_regex(r'class="evenement8">(.*?)', + webpage, u'title') return {'id': video_id, 'ext' : 'mp4', From 627a91a9a827b48270d3f5e288404388946f0733 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 26 Aug 2013 21:29:31 +0200 Subject: [PATCH 04/10] [generic] small typo --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8488dca05..d034a11bb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -126,7 +126,7 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) - # Look for BrigthCove: + # Look for BrightCove: m_brightcove = re.search(r'', webpage, re.DOTALL) if m_brightcove is not None: self.to_screen(u'Brightcove video detected.') From 976fc7d137c9f23533c9bcf5942af72d3c861b14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2EYasoob=20Ullah=20Khalid=20=E2=98=BA?= Date: Tue, 27 Aug 2013 01:00:17 +0500 Subject: [PATCH 05/10] fixed tests for c56 and dailymotion --- youtube_dl/extractor/c56.py | 4 ++-- youtube_dl/extractor/dailymotion.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 4c8a8af09..dc3a8d47d 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -12,8 +12,8 @@ class C56IE(InfoExtractor): _TEST ={ u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', - u'file': u'93440716.mp4', - u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'file': u'93440716.flv', + u'md5': u'e59995ac63d0457783ea05f93f12a866', u'info_dict': { u'title': u'网事知多少 第32期:车怒', }, diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index fa8c630d0..1ea449ca8 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,7 +21,7 @@ class DailymotionIE(InfoExtractor): u'file': u'x33vw9.mp4', u'md5': u'392c4b85a60a90dc4792da41ce3144eb', u'info_dict': { - u"uploader": u"Alex and Van .", + u"uploader": u"Amphora Alex and Van .", u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" } } From 341ca8d74c8f090bd696111353400f0cef2ba9bc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Aug 2013 01:59:00 +0200 Subject: [PATCH 06/10] [trilulilu] Add support for trilulilu.ro Fun fact: The ads (not yet supported) are loaded from youtube ;) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/trilulilu.py | 76 +++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 youtube_dl/extractor/trilulilu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f71ae2713..fa53d9af9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .c56 import C56IE from .canalplus import CanalplusIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE @@ -73,18 +74,18 @@ from .ted import TEDIE from .tf1 import TF1IE from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE -from .ustream import UstreamIE from .unistra import UnistraIE +from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE from .videofyme import VideofyMeIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE -from .c56 import C56IE from .wat import WatIE from .weibo import WeiboIE from .wimp import WimpIE diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py new file mode 100644 index 000000000..1c46156c7 --- /dev/null +++ b/youtube_dl/extractor/trilulilu.py @@ -0,0 +1,76 @@ +import json +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, +) + + +class TriluliluIE(InfoExtractor): + _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P[^/]+)/(?P[^/]+)' + _TEST = { + u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1", + u'file': u"big-buck-bunny-1.mp4", + u'info_dict': { + u"title": u"Big Buck Bunny", + u"description": u":) pentru copilul din noi", + }, + # Server ignores Range headers (--test) + u"params": { + u"skip_download": True + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + + log_str = self._search_regex( + r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info') + log = json.loads(log_str) + + format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/' + u'video-formats2' % log) + format_str = self._download_webpage( + format_url, video_id, + note=u'Downloading formats', + errnote=u'Error while downloading formats') + + format_doc = xml.etree.ElementTree.fromstring(format_str) + + video_url_template = ( + u'http://fs%(server)s.trilulilu.ro/stream.php?type=video' + u'&source=site&hash=%(hash)s&username=%(userid)s&' + u'key=ministhebest&format=%%s&sig=&exp=' % + log) + formats = [ + { + 'format': fnode.text, + 'url': video_url_template % fnode.text, + } + + for fnode in format_doc.findall('./formats/format') + ] + + info = { + '_type': 'video', + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = formats[-1]['format'].partition('-')[0] + + return info From b3889f702396b1e9641f2329d793915e5ae1454c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Aug 2013 02:30:47 +0200 Subject: [PATCH 07/10] release 2013.08.27 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c10ebd4e8..dff568640 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.23' +__version__ = '2013.08.27' From 069d098f846ca53073ec646f335f77dac4439844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 27 Aug 2013 10:21:57 +0200 Subject: [PATCH 08/10] [canalplus] Accept player.canalplus.fr urls --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 3b1c88876..1f02519a0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,7 +5,7 @@ from .common import InfoExtractor from ..utils import unified_strdate class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P\d+)' + _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' IE_NAME = u'canalplus.fr' From 2a7b4da9b2ee11e88976e0e93796fd8460aa053d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 27 Aug 2013 10:25:38 +0200 Subject: [PATCH 09/10] [hark] get the song info in JSON and extract more information. --- youtube_dl/extractor/hark.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index ab0a69697..5bdd08afa 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +import json from .common import InfoExtractor from ..utils import determine_ext @@ -12,24 +13,25 @@ class HarkIE(InfoExtractor): u'file': u'mmbzyhkgny.mp3', u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', u'info_dict': { - u"title": u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' On May 23, 2013 ", + u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", + u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + u'duration': 11, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - embed_url = "http://www.hark.com/clips/%s/homepage_embed" %(video_id) - webpage = self._download_webpage(embed_url, video_id) - - final_url = self._search_regex(r'src="(.+?).mp3"', - webpage, 'video url')+'.mp3' - title = self._html_search_regex(r'(.+?)', - webpage, 'video title').replace(' Sound Clip and Quote - Hark','').replace( - 'Sound Clip , Quote, MP3, and Ringtone - Hark','') + json_url = "http://www.hark.com/clips/%s.json" %(video_id) + info_json = self._download_webpage(json_url, video_id) + info = json.loads(info_json) + final_url = info['url'] return {'id': video_id, 'url' : final_url, - 'title': title, + 'title': info['name'], 'ext': determine_ext(final_url), + 'description': info['description'], + 'thumbnail': info['image_original'], + 'duration': info['duration'], } From e86ea47c029c1f95a696e43df7bea2e3e617fbc3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 27 Aug 2013 10:35:20 +0200 Subject: [PATCH 10/10] [canalc2] Small improvements --- youtube_dl/extractor/canalc2.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 215abf537..50832217a 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,17 +1,17 @@ # coding: utf-8 -"""Extractor for canalc2.tv""" import re from .common import InfoExtractor + class Canalc2IE(InfoExtractor): - """Extractor for canalc2.tv""" + _IE_NAME = 'canalc2.tv' _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' _TEST = { u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', u'file': u'12163.mp4', - u'md5': u'c00fa80517373764ff5c0b5eb5a58780', + u'md5': u'060158428b650f896c542dfbb3d6487f', u'info_dict': { u'title': u'Terrasses du Numérique' } @@ -20,16 +20,16 @@ class Canalc2IE(InfoExtractor): def _real_extract(self, url): video_id = re.match(self._VALID_URL, url).group(1) webpage = self._download_webpage(url, video_id) - file_name = re.search(r"so\.addVariable\('file','(.*?)'\);", - webpage).group(1) - + file_name = self._search_regex( + r"so\.addVariable\('file','(.*?)'\);", + webpage, 'file name') video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name - title = self._html_search_regex(r'class="evenement8">(.*?)', - webpage, u'title') + title = self._html_search_regex( + r'class="evenement8">(.*?)', webpage, u'title') return {'id': video_id, - 'ext' : 'mp4', - 'url' : video_url, - 'title' : title + 'ext': 'mp4', + 'url': video_url, + 'title': title, }