X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f69bad4f3a77d49be2d8b9cac3c6c256f4ee9898;hb=64c78d50ccf05f34e27b652530fc8b702aa54122;hp=61ba2a1b7a61f2c2f3dd760b2db1204e0e460b50;hpb=197080b10be36cbeccfe33b8de09a6a8a255f609;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 61ba2a1b7..f69bad4f3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -15,6 +15,7 @@ import email.utils import xml.etree.ElementTree import random import math +import operator from .utils import * @@ -73,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -136,7 +139,6 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms @@ -188,9 +190,11 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" @@ -304,7 +308,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -313,7 +317,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -324,7 +328,7 @@ class YoutubeIE(InfoExtractor): try: login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return galx = None @@ -368,10 +372,10 @@ class YoutubeIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -1330,7 +1334,7 @@ class GenericIE(InfoExtractor): opener = compat_urllib_request.OpenerDirector() for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: + compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) response = opener.open(HeadRequest(url)) @@ -1366,6 +1370,9 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + if mobj is None: + # Broaden the search a little bit: JWPlayer JS loader + mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return @@ -1449,7 +1456,7 @@ class YoutubeSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results self._download_n_results(query, n) return @@ -1469,12 +1476,16 @@ class YoutubeSearchIE(InfoExtractor): result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) request = compat_urllib_request.Request(result_url) try: - data = compat_urllib_request.urlopen(request).read() + data = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err)) return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.trouble(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1527,7 +1538,7 @@ class GoogleSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1611,7 +1622,7 @@ class YahooSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_yahoo_results: - self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) n = self._max_yahoo_results self._download_n_results(query, n) return @@ -1659,80 +1670,94 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*' - _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s' - _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | user/.*?/user/ + | p/ + | user/.*?#[pg]/c/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) def _real_extract(self, url): # Extract playlist id - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid url: %s' % url) return - # Single video case - if mobj.group(3) is not None: - self._downloader.download([mobj.group(3)]) - return - - # Download playlist pages - # prefix is 'p' as default for playlists but there are other types that need extra care - playlist_prefix = mobj.group(1) - if playlist_prefix == 'a': - playlist_access = 'artist' - else: - playlist_prefix = 'p' - playlist_access = 'view_play_list' - playlist_id = mobj.group(2) - video_ids = [] - pagenum = 1 + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] while True: - self.report_download_page(playlist_id, pagenum) - url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) - request = compat_urllib_request.Request(url) + self.report_download_page(playlist_id, page_num) + + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) try: - page = compat_urllib_request.urlopen(request).read().decode('utf-8') + page = compat_urllib_request.urlopen(url).read().decode('utf8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) return - # Extract video identifiers - ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + response = json.loads(page) + except ValueError as err: + self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err)) + return - if self._MORE_PAGES_INDICATOR not in page: + if not 'feed' in response or not 'entry' in response['feed']: + self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API') + return + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] + + if len(response['feed']['entry']) < self._MAX_RESULTS: break - pagenum = pagenum + 1 + page_num += 1 - total = len(video_ids) + videos = [v[1] for v in sorted(videos)] + total = len(videos) playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) if playlistend == -1: - video_ids = video_ids[playliststart:] + videos = videos[playliststart:] else: - video_ids = video_ids[playliststart:playlistend] + videos = videos[playliststart:playlistend] - if len(video_ids) == total: + if len(videos) == total: self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) else: - self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids))) + self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) + for video in videos: + self._downloader.download([video]) return @@ -1918,9 +1943,8 @@ class BlipTVUserIE(InfoExtractor): while True: self.report_download_page(username, pagenum) - - request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) ) - + url = page_base + "&page=" + str(pagenum) + request = compat_urllib_request.Request( url ) try: page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -2056,7 +2080,7 @@ class FacebookIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return if useremail is None: @@ -2073,10 +2097,10 @@ class FacebookIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return def _real_extract(self, url): @@ -2141,6 +2165,17 @@ class BlipTVIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + if '?' in url: cchar = '&' else: @@ -2296,9 +2331,10 @@ class ComedyCentralIE(InfoExtractor): '400': '384x216', } - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -2521,7 +2557,7 @@ class EscapistIE(InfoExtractor): 'uploader': showName, 'upload_date': None, 'title': showName, - 'ext': 'flv', + 'ext': 'mp4', 'thumbnail': imgUrl, 'description': description, 'player_url': playerUrl, @@ -3555,65 +3591,17 @@ class FunnyOrDieIE(InfoExtractor): } return [info] -class TweetReelIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P[0-9a-z]+)$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - m = re.search(r'
', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find status ID') - status_id = m.group(1) - - m = re.search(r'
(.*?)
', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'WARNING: Cannot find description') - desc = unescapeHTML(re.sub('', '', m.group(1))).strip() - - m = re.search(r'
.*?from (?P.+?)', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'ERROR: Cannot find uploader') - uploader = unescapeHTML(m.group('uploader')) - uploader_id = unescapeHTML(m.group('uploader_id')) - - m = re.search(r'video|app)/ #If the page is only for videos or for a game (?P\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) @@ -3624,18 +3612,22 @@ class SteamIE(InfoExtractor): mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' titles = re.finditer(namesRE, webpage) + thumbsRE = r'' + thumbs = re.finditer(thumbsRE, webpage) videos = [] - for vid,vtitle in zip(mweb,titles): + for vid,vtitle,thumb in zip(mweb,titles,thumbs): video_id = vid.group('videoID') title = vtitle.group('videoName') video_url = vid.group('videoURL') + video_thumb = thumb.group('thumbnail') if not video_url: self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id) info = { 'id':video_id, 'url':video_url, 'ext': 'flv', - 'title': unescapeHTML(title) + 'title': unescapeHTML(title), + 'thumbnail': video_thumb } videos.append(info) return videos @@ -3662,6 +3654,62 @@ class UstreamIE(InfoExtractor): } return [info] +class WorldStarHipHopIE(InfoExtractor): + _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(str(url)).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + video_url = None + ext = None + + _title = r"""(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' @@ -3701,7 +3749,7 @@ class RBMARadioIE(InfoExtractor): class YouPornIE(InfoExtractor): """Information extractor for youporn.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' - + def _print_formats(self, formats): """Print all available formats""" print(u'Available formats:') @@ -3729,23 +3777,23 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) # Get the video title - result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage) + result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') + raise ExtractorError(u'Unable to extract video title') video_title = result.group('title').strip() # Get the video date - result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage) + result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract video date') + self._downloader.report_warning(u'unable to extract video date') upload_date = None else: upload_date = result.group('date').strip() # Get the video uploader - result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage) + result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) if result is None: - self._downloader.to_stderr(u'ERROR: unable to extract uploader') + self._downloader.report_warning(u'unable to extract uploader') video_uploader = None else: video_uploader = result.group('uploader').strip() @@ -3763,8 +3811,8 @@ class YouPornIE(InfoExtractor): links = re.findall(LINK_RE, download_list_html) if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') - - self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) + + self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) formats = [] for link in links: @@ -3815,7 +3863,7 @@ class YouPornIE(InfoExtractor): return return [format] - + class PornotubeIE(InfoExtractor): """Information extractor for pornotube.com.""" @@ -3887,7 +3935,7 @@ class YouJizzIE(InfoExtractor): embed_page_url = result.group(0).strip() video_id = result.group('videoid') - + webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL @@ -3983,9 +4031,10 @@ class TEDIE(InfoExtractor): /(?P<name>\w+) # Here goes the name and then ".html" ''' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) @@ -4008,31 +4057,30 @@ class TEDIE(InfoExtractor): ([.\s]*?)data-playlist_item_id="(\d+)" ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)" ''' - video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>' + video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>' webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) info=[] for m_video, m_name in zip(m_videos,m_names): - video_dic={ - 'id': m_video.group('video_id'), - 'url': self._talk_video_link(m_video.group('mediaSlug')), - 'ext': 'mp4', - 'title': m_name.group('fullname') - } - info.append(video_dic) + video_id=m_video.group('video_id') + talk_url='http://www.ted.com%s' % m_name.group('talk_url') + info.append(self._talk_info(talk_url,video_id)) return info + def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" m=re.match(self._VALID_URL, url,re.VERBOSE) videoName=m.group('name') webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) # If the url includes the language we get the title translated - title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>' + title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>' title=re.search(title_RE, webpage).group('title') info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) "id":(?P<videoID>[\d]+).*? "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' + thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' + thumb_match=re.search(thumb_RE,webpage) info_match=re.search(info_RE,webpage,re.VERBOSE) video_id=info_match.group('videoID') mediaSlug=info_match.group('mediaSlug') @@ -4041,13 +4089,14 @@ class TEDIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title + 'title': title, + 'thumbnail': thumb_match.group('thumbnail') } return info class MySpassIE(InfoExtractor): _VALID_URL = r'http://www.myspass.de/.*' - + def _real_extract(self, url): META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' @@ -4057,12 +4106,12 @@ class MySpassIE(InfoExtractor): url_parent_path, video_id = os.path.split(url_path) if not video_id: _, video_id = os.path.split(url_parent_path) - + # get metadata metadata_url = META_DATA_URL_TEMPLATE % video_id metadata_text = self._download_webpage(metadata_url, video_id) metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) - + # extract values from metadata url_flv_el = metadata.find('url_flv') if url_flv_el is None: @@ -4140,9 +4189,9 @@ def gen_extractors(): GooglePlusIE(), ArteTvIE(), NBAIE(), + WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), - TweetReelIE(), SteamIE(), UstreamIE(), RBMARadioIE(),