X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f69bad4f3a77d49be2d8b9cac3c6c256f4ee9898;hb=64c78d50ccf05f34e27b652530fc8b702aa54122;hp=021579ce01120dffa25e8d58ca54edb8ab72ac19;hpb=6324fd1d7494449c168805db8e3f9fa45396367b;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 021579ce0..f69bad4f3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -74,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -137,7 +139,6 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms @@ -189,9 +190,11 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" @@ -305,7 +308,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -314,7 +317,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -325,7 +328,7 @@ class YoutubeIE(InfoExtractor): try: login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return galx = None @@ -369,10 +372,10 @@ class YoutubeIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -1453,7 +1456,7 @@ class YoutubeSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results self._download_n_results(query, n) return @@ -1479,6 +1482,10 @@ class YoutubeSearchIE(InfoExtractor): return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.trouble(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1531,7 +1538,7 @@ class GoogleSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1615,7 +1622,7 @@ class YahooSearchIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_yahoo_results: - self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) n = self._max_yahoo_results self._download_n_results(query, n) return @@ -1668,17 +1675,17 @@ class YoutubePlaylistIE(InfoExtractor): (?:\w+\.)? youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist) - \? .*? (p|a|list)= + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= | user/.*?/user/ | p/ | user/.*?#[pg]/c/ ) - (?:PL|EC)? - |PL|EC) - ([0-9A-Za-z-_]{10,}) - (?:/.*?/([0-9A-Za-z_-]+))? - .*""" + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' @@ -1686,9 +1693,10 @@ class YoutubePlaylistIE(InfoExtractor): def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" @@ -1701,13 +1709,8 @@ class YoutubePlaylistIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid url: %s' % url) return - # Single video case - if mobj.group(3) is not None: - self._downloader.download([mobj.group(3)]) - return - # Download playlist videos from API - playlist_id = mobj.group(2) + playlist_id = mobj.group(1) or mobj.group(2) page_num = 1 videos = [] @@ -1727,14 +1730,18 @@ class YoutubePlaylistIE(InfoExtractor): self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err)) return - videos += [(entry['yt$position']['$t'], entry['content']['src']) for entry in response['feed']['entry']] + if not 'feed' in response or not 'entry' in response['feed']: + self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API') + return + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] if len(response['feed']['entry']) < self._MAX_RESULTS: break page_num += 1 - videos = map(operator.itemgetter(1), sorted(videos)) - + videos = [v[1] for v in sorted(videos)] total = len(videos) playliststart = self._downloader.params.get('playliststart', 1) - 1 @@ -2073,7 +2080,7 @@ class FacebookIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return if useremail is None: @@ -2090,10 +2097,10 @@ class FacebookIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return def _real_extract(self, url): @@ -2158,6 +2165,17 @@ class BlipTVIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + if '?' in url: cchar = '&' else: @@ -2313,9 +2331,10 @@ class ComedyCentralIE(InfoExtractor): '400': '384x216', } - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -2538,7 +2557,7 @@ class EscapistIE(InfoExtractor): 'uploader': showName, 'upload_date': None, 'title': showName, - 'ext': 'flv', + 'ext': 'mp4', 'thumbnail': imgUrl, 'description': description, 'player_url': playerUrl, @@ -3572,55 +3591,6 @@ class FunnyOrDieIE(InfoExtractor): } return [info] -class TweetReelIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P[0-9a-z]+)$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - m = re.search(r'
', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find status ID') - status_id = m.group(1) - - m = re.search(r'
(.*?)
', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'WARNING: Cannot find description') - desc = unescapeHTML(re.sub('', '', m.group(1))).strip() - - m = re.search(r'
.*?from (?P.+?)', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'ERROR: Cannot find uploader') - uploader = unescapeHTML(m.group('uploader')) - uploader_id = unescapeHTML(m.group('uploader_id')) - - m = re.search(r'video|app)/ #If the page is only for videos or for a game @@ -3628,9 +3598,10 @@ class SteamIE(InfoExtractor): (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) @@ -3683,6 +3654,62 @@ class UstreamIE(InfoExtractor): } return [info] +class WorldStarHipHopIE(InfoExtractor): + _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(str(url)).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + video_url = None + ext = None + + _title = r"""(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' @@ -3758,7 +3785,7 @@ class YouPornIE(InfoExtractor): # Get the video date result = re.search(r'Date:(?P.*) ', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract video date') + self._downloader.report_warning(u'unable to extract video date') upload_date = None else: upload_date = result.group('date').strip() @@ -3766,7 +3793,7 @@ class YouPornIE(InfoExtractor): # Get the video uploader result = re.search(r'Submitted:(?P.*)', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract uploader') + self._downloader.report_warning(u'unable to extract uploader') video_uploader = None else: video_uploader = result.group('uploader').strip() @@ -4004,9 +4031,10 @@ class TEDIE(InfoExtractor): /(?P\w+) # Here goes the name and then ".html" ''' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) @@ -4161,9 +4189,9 @@ def gen_extractors(): GooglePlusIE(), ArteTvIE(), NBAIE(), + WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), - TweetReelIE(), SteamIE(), UstreamIE(), RBMARadioIE(),