standardized the use of unescapeHTML; added clean_html()

This commit is contained in:
Filippo Valsorda - Campagna 2012-04-10 16:31:46 +02:00
parent ceba827e9a
commit d6a9615347
1 changed files with 15 additions and 17 deletions

View File

@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity) return (u'&%s;' % entity)
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
# Newline vs <br />
html = html.replace('\n', ' ')
html = re.sub('<\s*br\s*/?\s*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
return html
def sanitize_title(utitle): def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename.""" """Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@ -3343,8 +3355,6 @@ class EscapistIE(InfoExtractor):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -3360,11 +3370,11 @@ class EscapistIE(InfoExtractor):
return return
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = htmlParser.unescape(descMatch.group(1)) description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage) imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
imgUrl = htmlParser.unescape(imgMatch.group(1)) imgUrl = unescapeHTML(imgMatch.group(1))
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
playerUrl = htmlParser.unescape(playerUrlMatch.group(1)) playerUrl = unescapeHTML(playerUrlMatch.group(1))
configUrlMatch = re.search('config=(.*)$', playerUrl) configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = urllib2.unquote(configUrlMatch.group(1)) configUrl = urllib2.unquote(configUrlMatch.group(1))
@ -3423,8 +3433,6 @@ class CollegeHumorIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -3495,8 +3503,6 @@ class XVideosIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -3585,8 +3591,6 @@ class SoundcloudIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -3674,8 +3678,6 @@ class InfoQIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url): def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None: if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -3909,8 +3911,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
except UnavailableVideoError, err: except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video') self._downloader.trouble(u'\nERROR: unable to download video')
elif mobj.group('course'): # A course page elif mobj.group('course'): # A course page
unescapeHTML = HTMLParser.HTMLParser().unescape
course = mobj.group('course') course = mobj.group('course')
info = { info = {
'id': _simplify_title(course), 'id': _simplify_title(course),
@ -3947,8 +3947,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
assert entry['type'] == 'reference' assert entry['type'] == 'reference'
self.extract(entry['url']) self.extract(entry['url'])
else: # Root page else: # Root page
unescapeHTML = HTMLParser.HTMLParser().unescape
info = { info = {
'id': 'Stanford OpenClassroom', 'id': 'Stanford OpenClassroom',
'type': 'playlist', 'type': 'playlist',