X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f69bad4f3a77d49be2d8b9cac3c6c256f4ee9898;hb=64c78d50ccf05f34e27b652530fc8b702aa54122;hp=ac69f82fe95345dbb67e7441ed3f66de4627e613;hpb=f0877a445e29ff58a4d377c8a193efb8f7c9b8e8;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index ac69f82fe..f69bad4f3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -15,6 +15,7 @@ import email.utils import xml.etree.ElementTree import random import math +import operator from .utils import * @@ -73,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -136,7 +139,6 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms @@ -188,9 +190,11 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" @@ -304,7 +308,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -313,7 +317,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -324,7 +328,7 @@ class YoutubeIE(InfoExtractor): try: login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return galx = None @@ -368,10 +372,10 @@ class YoutubeIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -718,6 +722,7 @@ class DailymotionIE(InfoExtractor): _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' + _WORKING = False def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1329,7 +1334,7 @@ class GenericIE(InfoExtractor): opener = compat_urllib_request.OpenerDirector() for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: + compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: opener.add_handler(handler()) response = opener.open(HeadRequest(url)) @@ -1365,6 +1370,9 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + if mobj is None: + # Broaden the search a little bit: JWPlayer JS loader + mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) if mobj is None: self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) return @@ -1448,7 +1456,7 @@ class YoutubeSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results self._download_n_results(query, n) return @@ -1468,12 +1476,16 @@ class YoutubeSearchIE(InfoExtractor): result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) request = compat_urllib_request.Request(result_url) try: - data = compat_urllib_request.urlopen(request).read() + data = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err)) return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.trouble(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1526,7 +1538,7 @@ class GoogleSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1610,7 +1622,7 @@ class YahooSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_yahoo_results: - self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) n = self._max_yahoo_results self._download_n_results(query, n) return @@ -1658,80 +1670,94 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*' - _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s' - _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | user/.*?/user/ + | p/ + | user/.*?#[pg]/c/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) def _real_extract(self, url): # Extract playlist id - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid url: %s' % url) return - # Single video case - if mobj.group(3) is not None: - self._downloader.download([mobj.group(3)]) - return - - # Download playlist pages - # prefix is 'p' as default for playlists but there are other types that need extra care - playlist_prefix = mobj.group(1) - if playlist_prefix == 'a': - playlist_access = 'artist' - else: - playlist_prefix = 'p' - playlist_access = 'view_play_list' - playlist_id = mobj.group(2) - video_ids = [] - pagenum = 1 + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] while True: - self.report_download_page(playlist_id, pagenum) - url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) - request = compat_urllib_request.Request(url) + self.report_download_page(playlist_id, page_num) + + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) try: - page = compat_urllib_request.urlopen(request).read().decode('utf-8') + page = compat_urllib_request.urlopen(url).read().decode('utf8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) return - # Extract video identifiers - ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + response = json.loads(page) + except ValueError as err: + self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err)) + return - if self._MORE_PAGES_INDICATOR not in page: + if not 'feed' in response or not 'entry' in response['feed']: + self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API') + return + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] + + if len(response['feed']['entry']) < self._MAX_RESULTS: break - pagenum = pagenum + 1 + page_num += 1 - total = len(video_ids) + videos = [v[1] for v in sorted(videos)] + total = len(videos) playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) if playlistend == -1: - video_ids = video_ids[playliststart:] + videos = videos[playliststart:] else: - video_ids = video_ids[playliststart:playlistend] + videos = videos[playliststart:playlistend] - if len(video_ids) == total: + if len(videos) == total: self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) else: - self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids))) + self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) + for video in videos: + self._downloader.download([video]) return @@ -1917,9 +1943,8 @@ class BlipTVUserIE(InfoExtractor): while True: self.report_download_page(username, pagenum) - - request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) ) - + url = page_base + "&page=" + str(pagenum) + request = compat_urllib_request.Request( url ) try: page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -2055,7 +2080,7 @@ class FacebookIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return if useremail is None: @@ -2072,10 +2097,10 @@ class FacebookIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return def _real_extract(self, url): @@ -2097,6 +2122,10 @@ class FacebookIE(InfoExtractor): params_raw = compat_urllib_parse.unquote(data['params']) params = json.loads(params_raw) video_url = params['hd_src'] + if not video_url: + video_url = params['sd_src'] + if not video_url: + raise ExtractorError(u'Cannot find video URL') video_duration = int(params['video_duration']) m = re.search('

([^<]+)

', webpage) @@ -2136,6 +2165,17 @@ class BlipTVIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + if '?' in url: cchar = '&' else: @@ -2232,7 +2272,7 @@ class MyVideoIE(InfoExtractor): webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) - mobj = re.search(r'', + mobj = re.search(r'', webpage) if mobj is None: self._downloader.trouble(u'ERROR: unable to extract media URL') @@ -2291,9 +2331,10 @@ class ComedyCentralIE(InfoExtractor): '400': '384x216', } - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -2516,7 +2557,7 @@ class EscapistIE(InfoExtractor): 'uploader': showName, 'upload_date': None, 'title': showName, - 'ext': 'flv', + 'ext': 'mp4', 'thumbnail': imgUrl, 'description': description, 'player_url': playerUrl, @@ -3550,65 +3591,17 @@ class FunnyOrDieIE(InfoExtractor): } return [info] -class TweetReelIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P[0-9a-z]+)$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - m = re.search(r'
', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find status ID') - status_id = m.group(1) - - m = re.search(r'
(.*?)
', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'WARNING: Cannot find description') - desc = unescapeHTML(re.sub('', '', m.group(1))).strip() - - m = re.search(r'
.*?from (?P.+?)', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'ERROR: Cannot find uploader') - uploader = unescapeHTML(m.group('uploader')) - uploader_id = unescapeHTML(m.group('uploader_id')) - - m = re.search(r'video|app)/ #If the page is only for videos or for a game (?P\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) @@ -3619,18 +3612,22 @@ class SteamIE(InfoExtractor): mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' titles = re.finditer(namesRE, webpage) + thumbsRE = r'' + thumbs = re.finditer(thumbsRE, webpage) videos = [] - for vid,vtitle in zip(mweb,titles): + for vid,vtitle,thumb in zip(mweb,titles,thumbs): video_id = vid.group('videoID') title = vtitle.group('videoName') video_url = vid.group('videoURL') + video_thumb = thumb.group('thumbnail') if not video_url: self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id) info = { 'id':video_id, 'url':video_url, 'ext': 'flv', - 'title': unescapeHTML(title) + 'title': unescapeHTML(title), + 'thumbnail': video_thumb } videos.append(info) return videos @@ -3657,6 +3654,62 @@ class UstreamIE(InfoExtractor): } return [info] +class WorldStarHipHopIE(InfoExtractor): + _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(str(url)).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + video_url = None + ext = None + + _title = r"""(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' @@ -3696,7 +3749,7 @@ class RBMARadioIE(InfoExtractor): class YouPornIE(InfoExtractor): """Information extractor for youporn.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' - + def _print_formats(self, formats): """Print all available formats""" print(u'Available formats:') @@ -3724,23 +3777,23 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) # Get the video title - result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage) + result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') + raise ExtractorError(u'Unable to extract video title') video_title = result.group('title').strip() # Get the video date - result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage) + result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract video date') + self._downloader.report_warning(u'unable to extract video date') upload_date = None else: upload_date = result.group('date').strip() # Get the video uploader - result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage) + result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) if result is None: - self._downloader.to_stderr(u'ERROR: unable to extract uploader') + self._downloader.report_warning(u'unable to extract uploader') video_uploader = None else: video_uploader = result.group('uploader').strip() @@ -3758,8 +3811,8 @@ class YouPornIE(InfoExtractor): links = re.findall(LINK_RE, download_list_html) if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') - - self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) + + self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) formats = [] for link in links: @@ -3810,7 +3863,7 @@ class YouPornIE(InfoExtractor): return return [format] - + class PornotubeIE(InfoExtractor): """Information extractor for pornotube.com.""" @@ -3882,7 +3935,7 @@ class YouJizzIE(InfoExtractor): embed_page_url = result.group(0).strip() video_id = result.group('videoid') - + webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL @@ -3912,7 +3965,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL) + m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) if not m: raise ExtractorError(u'Cannot find trax information') json_like = m.group(1) @@ -3968,6 +4021,135 @@ class KeekIE(InfoExtractor): } return [info] +class TEDIE(InfoExtractor): + _VALID_URL=r'''http://www.ted.com/ + ( + ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist + | + ((?P<type_talk>talks)) # We have a simple talk + ) + /(?P<name>\w+) # Here goes the name and then ".html" + ''' + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def _real_extract(self, url): + m=re.match(self._VALID_URL, url, re.VERBOSE) + if m.group('type_talk'): + return [self._talk_info(url)] + else : + playlist_id=m.group('playlist_id') + name=m.group('name') + self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name)) + return self._playlist_videos_info(url,name,playlist_id) + + def _talk_video_link(self,mediaSlug): + '''Returns the video link for that mediaSlug''' + return 'http://download.ted.com/talks/%s.mp4' % mediaSlug + + def _playlist_videos_info(self,url,name,playlist_id=0): + '''Returns the videos of the playlist''' + video_RE=r''' + <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)" + ([.\s]*?)data-playlist_item_id="(\d+)" + ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)" + ''' + video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>' + webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') + m_videos=re.finditer(video_RE,webpage,re.VERBOSE) + m_names=re.finditer(video_name_RE,webpage) + info=[] + for m_video, m_name in zip(m_videos,m_names): + video_id=m_video.group('video_id') + talk_url='http://www.ted.com%s' % m_name.group('talk_url') + info.append(self._talk_info(talk_url,video_id)) + return info + + def _talk_info(self, url, video_id=0): + """Return the video for the talk in the url""" + m=re.match(self._VALID_URL, url,re.VERBOSE) + videoName=m.group('name') + webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + # If the url includes the language we get the title translated + title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>' + title=re.search(title_RE, webpage).group('title') + info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) + "id":(?P<videoID>[\d]+).*? + "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' + thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' + thumb_match=re.search(thumb_RE,webpage) + info_match=re.search(info_RE,webpage,re.VERBOSE) + video_id=info_match.group('videoID') + mediaSlug=info_match.group('mediaSlug') + video_url=self._talk_video_link(mediaSlug) + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'thumbnail': thumb_match.group('thumbnail') + } + return info + +class MySpassIE(InfoExtractor): + _VALID_URL = r'http://www.myspass.de/.*' + + def _real_extract(self, url): + META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' + + # video id is the last path element of the URL + # usually there is a trailing slash, so also try the second but last + url_path = compat_urllib_parse_urlparse(url).path + url_parent_path, video_id = os.path.split(url_path) + if not video_id: + _, video_id = os.path.split(url_parent_path) + + # get metadata + metadata_url = META_DATA_URL_TEMPLATE % video_id + metadata_text = self._download_webpage(metadata_url, video_id) + metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) + + # extract values from metadata + url_flv_el = metadata.find('url_flv') + if url_flv_el is None: + self._downloader.trouble(u'ERROR: unable to extract download url') + return + video_url = url_flv_el.text + extension = os.path.splitext(video_url)[1][1:] + title_el = metadata.find('title') + if title_el is None: + self._downloader.trouble(u'ERROR: unable to extract title') + return + title = title_el.text + format_id_el = metadata.find('format_id') + if format_id_el is None: + format = ext + else: + format = format_id_el.text + description_el = metadata.find('description') + if description_el is not None: + description = description_el.text + else: + description = None + imagePreview_el = metadata.find('imagePreview') + if imagePreview_el is not None: + thumbnail = imagePreview_el.text + else: + thumbnail = None + info = { + 'id': video_id, + 'url': video_url, + 'title': title, + 'ext': extension, + 'format': format, + 'thumbnail': thumbnail, + 'description': description + } + return [info] + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4007,14 +4189,16 @@ def gen_extractors(): GooglePlusIE(), ArteTvIE(), NBAIE(), + WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), - TweetReelIE(), SteamIE(), UstreamIE(), RBMARadioIE(), EightTracksIE(), KeekIE(), + TEDIE(), + MySpassIE(), GenericIE() ]