X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=81eaddc7260333e70a01be3ea354edfc7513f339;hb=14294236bf9623fac4ad748389689b77d52b3547;hp=64a6cfbc8fa68ef34165eb1881c36ff7edbd2adf;hpb=60bd48b175792d55cc91a3ad8c3109e2ed30fcb8;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 64a6cfbc8..81eaddc72 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -15,6 +15,7 @@ import email.utils import xml.etree.ElementTree import random import math +import operator from .utils import * @@ -47,7 +48,7 @@ class InfoExtractor(object): uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. + subtitles: The subtitle file contents. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@ -73,13 +74,15 @@ class InfoExtractor(object): self._ready = False self.set_downloader(downloader) - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url) is not None + return re.match(cls._VALID_URL, url) is not None - def working(self): + @classmethod + def working(cls): """Getter method for _WORKING.""" - return self._WORKING + return cls._WORKING def initialize(self): """Initializes an instance (authentication, etc).""" @@ -123,8 +126,36 @@ class InfoExtractor(object): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): """ Returns the data of the page as a string """ urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' webpage_bytes = urlh.read() - return webpage_bytes.decode('utf-8', 'replace') + return webpage_bytes.decode(encoding, 'replace') + + #Methods for following #608 + #They set the correct value of the '_type' key + def video_result(self, video_info): + """Returns a video""" + video_info['_type'] = 'video' + return video_info + def url_result(self, url, ie=None): + """Returns a url that points to a page that should be processed""" + #TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url} + return video_info + def playlist_result(self, entries, playlist_id=None, playlist_title=None): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + if playlist_id: + video_info['id'] = playlist_id + if playlist_title: + video_info['title'] = playlist_title + return video_info class YoutubeIE(InfoExtractor): @@ -136,7 +167,6 @@ class YoutubeIE(InfoExtractor): (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls - (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms @@ -188,9 +218,11 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" @@ -214,7 +246,16 @@ class YoutubeIE(InfoExtractor): def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id) + + def report_video_subtitles_request(self, video_id, sub_lang, format): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" @@ -228,55 +269,75 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - - def _extract_subtitles(self, video_id): + def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) + return (u'unable to download video subtitles: %s' % compat_str(err), None) + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: + return (u'video doesn\'t have subtitles', None) + return sub_lang_list + + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ + Return tuple: + (error_message, sub_lang, sub) + """ + self.report_video_subtitles_request(video_id, sub_lang, format) params = compat_urllib_parse.urlencode({ - 'lang': srt_lang, - 'name': srt_lang_list[srt_lang].encode('utf-8'), + 'lang': sub_lang, + 'name': sub_name, 'v': video_id, + 'fmt': format, }) url = 'http://www.youtube.com/api/timedtext?' + params try: - srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8') + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: - return (u'WARNING: Did not fetch video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) + return (u'unable to download video subtitles: %s' % compat_str(err), None, None) + if not sub: + return (u'Did not fetch video subtitles', None, None) + return (None, sub_lang, sub) + + def _extract_subtitle(self, video_id): + """ + Return a list with a tuple: + [(error_message, sub_lang, sub)] + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] + + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + return [subtitle] + + def _extract_all_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + subtitles.append(subtitle) + return subtitles def _print_formats(self, formats): print('Available formats:') @@ -304,7 +365,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return # Set language @@ -313,7 +374,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) return # No authentication to be performed @@ -324,7 +385,7 @@ class YoutubeIE(InfoExtractor): try: login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) return galx = None @@ -368,10 +429,10 @@ class YoutubeIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') if re.search(r'(?i)]* id="gaia_loginform"', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') + self._downloader.report_warning(u'unable to log in: bad username or password') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return # Confirm age @@ -384,13 +445,13 @@ class YoutubeIE(InfoExtractor): self.report_age_confirmation() age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err)) return def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(2) return video_id @@ -409,7 +470,7 @@ class YoutubeIE(InfoExtractor): try: video_webpage_bytes = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err)) return video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') @@ -434,18 +495,18 @@ class YoutubeIE(InfoExtractor): if 'token' in video_info: break except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err)) return if 'token' not in video_info: if 'reason' in video_info: - self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0]) + self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0]) else: - self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') + self._downloader.report_error(u'"token" parameter not in video info for unknown reason') return # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - self._downloader.trouble(u'ERROR: "rental" videos not supported') + self._downloader.report_error(u'"rental" videos not supported') return # Start extracting information @@ -453,7 +514,7 @@ class YoutubeIE(InfoExtractor): # uploader if 'author' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract uploader name') + self._downloader.report_error(u'unable to extract uploader name') return video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) @@ -463,17 +524,17 @@ class YoutubeIE(InfoExtractor): if mobj is not None: video_uploader_id = mobj.group(1) else: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + self._downloader.report_warning(u'unable to extract uploader nickname') # title if 'title' not in video_info: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) # thumbnail image if 'thumbnail_url' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video thumbnail') + self._downloader.report_warning(u'unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -497,15 +558,29 @@ class YoutubeIE(InfoExtractor): else: video_description = '' - # closed captions + # subtitles video_subtitles = None + if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) + video_subtitles = self._extract_subtitle(video_id) + if video_subtitles: + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_all_subtitles(video_id) + for video_subtitle in video_subtitles: + (sub_error, sub_lang, sub) = video_subtitle + if sub_error: + self._downloader.report_error(sub_error) + + if self._downloader.params.get('listsubtitles', False): + sub_lang_list = self._list_available_subtitles(video_id) + return if 'length_seconds' not in video_info: - self._downloader.trouble(u'WARNING: unable to extract video duration') + self._downloader.report_warning(u'unable to extract video duration') video_duration = '' else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) @@ -533,7 +608,7 @@ class YoutubeIE(InfoExtractor): format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: - self._downloader.trouble(u'ERROR: no known formats available for video') + self._downloader.report_error(u'no known formats available for video') return if self._downloader.params.get('listformats', None): self._print_formats(existing_formats) @@ -554,10 +629,10 @@ class YoutubeIE(InfoExtractor): video_url_list = [(rf, url_map[rf])] break if video_url_list is None: - self._downloader.trouble(u'ERROR: requested format not available') + self._downloader.report_error(u'requested format not available') return else: - self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') + self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info') return results = [] @@ -620,7 +695,7 @@ class MetacafeIE(InfoExtractor): self.report_disclaimer() disclaimer = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) + self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err)) return # Confirm age @@ -633,14 +708,14 @@ class MetacafeIE(InfoExtractor): self.report_age_confirmation() disclaimer = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err)) return def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) @@ -657,7 +732,7 @@ class MetacafeIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader and title from webpage @@ -677,15 +752,15 @@ class MetacafeIE(InfoExtractor): else: mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return vardict = compat_parse_qs(mobj.group(1)) if 'mediaData' not in vardict: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return mediaURL = mobj.group(1).replace('\\/', '/') video_extension = mediaURL[-3:] @@ -693,13 +768,13 @@ class MetacafeIE(InfoExtractor): mobj = re.search(r'(?im)(.*) - Video', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1).decode('utf-8') mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') + self._downloader.report_error(u'unable to extract uploader nickname') return video_uploader = mobj.group(1) @@ -731,7 +806,7 @@ class DailymotionIE(InfoExtractor): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1).split('_')[0].split('?')[0] @@ -747,7 +822,7 @@ class DailymotionIE(InfoExtractor): self.report_extraction(video_id) mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return flashvars = compat_urllib_parse.unquote(mobj.group(1)) @@ -757,12 +832,12 @@ class DailymotionIE(InfoExtractor): self._downloader.to_screen(u'[dailymotion] Using %s' % key) break else: - self._downloader.trouble(u'ERROR: unable to extract video URL') + self._downloader.report_error(u'unable to extract video URL') return mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video URL') + self._downloader.report_error(u'unable to extract video URL') return video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') @@ -771,7 +846,7 @@ class DailymotionIE(InfoExtractor): mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = unescapeHTML(mobj.group('title')) @@ -781,7 +856,7 @@ class DailymotionIE(InfoExtractor): # lookin for official user mobj_official = re.search(r'', webpage) if mobj_official is None: - self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + self._downloader.report_warning(u'unable to extract uploader nickname') else: video_uploader = mobj_official.group(1) else: @@ -823,7 +898,7 @@ class PhotobucketIE(InfoExtractor): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return video_id = mobj.group(1) @@ -836,14 +911,14 @@ class PhotobucketIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract URL, uploader, and title from webpage self.report_extraction(video_id) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.report_error(u'unable to extract media URL') return mediaURL = compat_urllib_parse.unquote(mobj.group(1)) @@ -851,7 +926,7 @@ class PhotobucketIE(InfoExtractor): mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1).decode('utf-8') @@ -892,7 +967,7 @@ class YahooIE(InfoExtractor): # Extract ID from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return video_id = mobj.group(2) @@ -905,18 +980,18 @@ class YahooIE(InfoExtractor): try: webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract id field') + self._downloader.report_error(u'Unable to extract id field') return yahoo_id = mobj.group(1) mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: Unable to extract vid field') + self._downloader.report_error(u'Unable to extract vid field') return yahoo_vid = mobj.group(1) @@ -929,34 +1004,34 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = mobj.group(1).decode('utf-8') mobj = re.search(r'

(.*)

', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video uploader') + self._downloader.report_error(u'unable to extract video uploader') return video_uploader = mobj.group(1).decode('utf-8') # Extract video thumbnail mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return video_thumbnail = mobj.group(1).decode('utf-8') # Extract video description mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video description') + self._downloader.report_error(u'unable to extract video description') return video_description = mobj.group(1).decode('utf-8') if not video_description: @@ -965,13 +1040,13 @@ class YahooIE(InfoExtractor): # Extract video height and width mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video height') + self._downloader.report_error(u'unable to extract video height') return yv_video_height = mobj.group(1) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video width') + self._downloader.report_error(u'unable to extract video width') return yv_video_width = mobj.group(1) @@ -987,13 +1062,13 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return # Extract media URL from playlist XML mobj = re.search(r'(.*)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_uploader = mobj.group(1) @@ -1433,7 +1504,7 @@ class YoutubeSearchIE(InfoExtractor): def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') @@ -1449,10 +1520,10 @@ class YoutubeSearchIE(InfoExtractor): try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_youtube_results: - self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) + self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) n = self._max_youtube_results self._download_n_results(query, n) return @@ -1472,12 +1543,16 @@ class YoutubeSearchIE(InfoExtractor): result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) request = compat_urllib_request.Request(result_url) try: - data = compat_urllib_request.urlopen(request).read() + data = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download API page: %s' % compat_str(err)) return api_response = json.loads(data)['data'] + if not 'items' in api_response: + self._downloader.trouble(u'[youtube] No video results') + return + new_ids = list(video['id'] for video in api_response['items']) video_ids += new_ids @@ -1511,7 +1586,7 @@ class GoogleSearchIE(InfoExtractor): def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') @@ -1527,10 +1602,10 @@ class GoogleSearchIE(InfoExtractor): try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_google_results: - self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) n = self._max_google_results self._download_n_results(query, n) return @@ -1551,7 +1626,7 @@ class GoogleSearchIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1595,7 +1670,7 @@ class YahooSearchIE(InfoExtractor): def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) if mobj is None: - self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) + self._downloader.report_error(u'invalid search query "%s"' % query) return prefix, query = query.split(':') @@ -1611,10 +1686,10 @@ class YahooSearchIE(InfoExtractor): try: n = int(prefix) if n <= 0: - self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) + self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query)) return elif n > self._max_yahoo_results: - self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) + self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) n = self._max_yahoo_results self._download_n_results(query, n) return @@ -1636,7 +1711,7 @@ class YahooSearchIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1662,81 +1737,92 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*' - _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s' - _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}" + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | p/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def report_download_page(self, playlist_id, pagenum): """Report attempt to download playlist page with given number.""" self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) def _real_extract(self, url): # Extract playlist id - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return - # Single video case - if mobj.group(3) is not None: - self._downloader.download([mobj.group(3)]) - return - - # Download playlist pages - # prefix is 'p' as default for playlists but there are other types that need extra care - playlist_prefix = mobj.group(1) - if playlist_prefix == 'a': - playlist_access = 'artist' - else: - playlist_prefix = 'p' - playlist_access = 'view_play_list' - playlist_id = mobj.group(2) - video_ids = [] - pagenum = 1 + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] while True: - self.report_download_page(playlist_id, pagenum) - url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) - request = compat_urllib_request.Request(url) + self.report_download_page(playlist_id, page_num) + + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) try: - page = compat_urllib_request.urlopen(request).read().decode('utf-8') + page = compat_urllib_request.urlopen(url).read().decode('utf8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return - # Extract video identifiers - ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - video_ids.extend(ids_in_page) + try: + response = json.loads(page) + except ValueError as err: + self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err)) + return - if self._MORE_PAGES_INDICATOR not in page: + if not 'feed' in response or not 'entry' in response['feed']: + self._downloader.report_error(u'Got a malformed response from YouTube API') + return + videos += [ (entry['yt$position']['$t'], entry['content']['src']) + for entry in response['feed']['entry'] + if 'content' in entry ] + + if len(response['feed']['entry']) < self._MAX_RESULTS: break - pagenum = pagenum + 1 + page_num += 1 - total = len(video_ids) + videos = [v[1] for v in sorted(videos)] + total = len(videos) playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) if playlistend == -1: - video_ids = video_ids[playliststart:] + videos = videos[playliststart:] else: - video_ids = video_ids[playliststart:playlistend] + videos = videos[playliststart:playlistend] - if len(video_ids) == total: + if len(videos) == total: self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total)) else: - self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids))) + self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + url_results = [self.url_result(url) for url in videos] + return [self.playlist_result(url_results, playlist_id)] class YoutubeChannelIE(InfoExtractor): @@ -1755,7 +1841,7 @@ class YoutubeChannelIE(InfoExtractor): # Extract channel id mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return # Download channel pages @@ -1770,7 +1856,7 @@ class YoutubeChannelIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read().decode('utf8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1786,9 +1872,9 @@ class YoutubeChannelIE(InfoExtractor): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries, channel_id)] class YoutubeUserIE(InfoExtractor): @@ -1813,7 +1899,7 @@ class YoutubeUserIE(InfoExtractor): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return username = mobj.group(1) @@ -1835,7 +1921,7 @@ class YoutubeUserIE(InfoExtractor): try: page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return # Extract video identifiers @@ -1870,8 +1956,9 @@ class YoutubeUserIE(InfoExtractor): self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % (username, all_ids_count, len(video_ids))) - for video_id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) + urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] + url_results = [self.url_result(url) for url in urls] + return [self.playlist_result(url_results, playlist_title = username)] class BlipTVUserIE(InfoExtractor): @@ -1893,7 +1980,7 @@ class BlipTVUserIE(InfoExtractor): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid url: %s' % url) + self._downloader.report_error(u'invalid url: %s' % url) return username = mobj.group(1) @@ -1907,7 +1994,7 @@ class BlipTVUserIE(InfoExtractor): mobj = re.search(r'data-users-id="([^"]+)"', page) page_base = page_base % mobj.group(1) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return @@ -1921,13 +2008,12 @@ class BlipTVUserIE(InfoExtractor): while True: self.report_download_page(username, pagenum) - - request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) ) - + url = page_base + "&page=" + str(pagenum) + request = compat_urllib_request.Request( url ) try: page = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % str(err)) return # Extract video identifiers @@ -1962,8 +2048,9 @@ class BlipTVUserIE(InfoExtractor): self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % (self.IE_NAME, username, all_ids_count, len(video_ids))) - for video_id in video_ids: - self._downloader.download([u'http://blip.tv/'+video_id]) + urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries, playlist_title = username)] class DepositFilesIE(InfoExtractor): @@ -1991,7 +2078,7 @@ class DepositFilesIE(InfoExtractor): self.report_download_webpage(file_id) webpage = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err)) return # Search for the real file URL @@ -2001,9 +2088,9 @@ class DepositFilesIE(InfoExtractor): mobj = re.search(r'(Attention.*?)', webpage, re.DOTALL) if (mobj is not None) and (mobj.group(1) is not None): restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip() - self._downloader.trouble(u'ERROR: %s' % restriction_message) + self._downloader.report_error(u'%s' % restriction_message) else: - self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url) + self._downloader.report_error(u'unable to extract download URL from: %s' % url) return file_url = mobj.group(1) @@ -2012,7 +2099,7 @@ class DepositFilesIE(InfoExtractor): # Search for file title mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return file_title = mobj.group(1).decode('utf-8') @@ -2059,7 +2146,7 @@ class FacebookIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) return if useremail is None: @@ -2076,16 +2163,16 @@ class FacebookIE(InfoExtractor): self.report_login() login_results = compat_urllib_request.urlopen(request).read() if re.search(r'', login_results) is not None: - self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') + self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) return def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('ID') @@ -2141,9 +2228,20 @@ class BlipTVIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return + urlp = compat_urllib_parse_urlparse(url) + if urlp.path.startswith('/play/'): + request = compat_urllib_request.Request(url) + response = compat_urllib_request.urlopen(request) + redirecturl = response.geturl() + rurlp = compat_urllib_parse_urlparse(redirecturl) + file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] + url = 'http://blip.tv/a/a-' + file_id + return self._real_extract(url) + + if '?' in url: cchar = '&' else: @@ -2177,7 +2275,7 @@ class BlipTVIE(InfoExtractor): json_code_bytes = urlh.read() json_code = json_code_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err)) return try: @@ -2208,7 +2306,7 @@ class BlipTVIE(InfoExtractor): 'user_agent': 'iTunes/10.6.1', } except (ValueError,KeyError) as err: - self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err)) + self._downloader.report_error(u'unable to parse video information: %s' % repr(err)) return return [info] @@ -2230,7 +2328,7 @@ class MyVideoIE(InfoExtractor): def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._download.trouble(u'ERROR: invalid URL: %s' % url) + self._download.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) @@ -2240,16 +2338,16 @@ class MyVideoIE(InfoExtractor): webpage = self._download_webpage(webpage_url, video_id) self.report_extraction(video_id) - mobj = re.search(r'', + mobj = re.search(r'([^<]+)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return video_title = mobj.group(1) @@ -2299,9 +2397,10 @@ class ComedyCentralIE(InfoExtractor): '400': '384x216', } - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -2321,7 +2420,7 @@ class ComedyCentralIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return if mobj.group('shortname'): @@ -2352,16 +2451,16 @@ class ComedyCentralIE(InfoExtractor): html = htmlHandle.read() webpage = html.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return if dlNewest: url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url) + self._downloader.report_error(u'Invalid redirected URL: ' + url) return if mobj.group('episode') == '': - self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url) + self._downloader.report_error(u'Redirected URL is still not specific: ' + url) return epTitle = mobj.group('episode') @@ -2374,7 +2473,7 @@ class ComedyCentralIE(InfoExtractor): altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) if len(altMovieParams) == 0: - self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url) + self._downloader.report_error(u'unable to find Flash URL in webpage ' + url) return else: mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])] @@ -2385,7 +2484,7 @@ class ComedyCentralIE(InfoExtractor): try: indexXml = compat_urllib_request.urlopen(indexUrl).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err)) + self._downloader.report_error(u'unable to download episode index: ' + compat_str(err)) return results = [] @@ -2406,7 +2505,7 @@ class ComedyCentralIE(InfoExtractor): try: configXml = compat_urllib_request.urlopen(configReq).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err)) return cdoc = xml.etree.ElementTree.fromstring(configXml) @@ -2416,7 +2515,7 @@ class ComedyCentralIE(InfoExtractor): turls.append(finfo) if len(turls) == 0: - self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found') + self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') continue if self._downloader.params.get('listformats', None): @@ -2473,7 +2572,7 @@ class EscapistIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return showName = mobj.group('showname') videoId = mobj.group('episode') @@ -2485,7 +2584,7 @@ class EscapistIE(InfoExtractor): m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type']) webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err)) + self._downloader.report_error(u'unable to download webpage: ' + compat_str(err)) return descMatch = re.search('(.*?)\s+-\s+XVID', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = mobj.group(1) @@ -2645,7 +2744,7 @@ class XVideosIE(InfoExtractor): # Extract video thumbnail mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return video_thumbnail = mobj.group(0) @@ -2689,7 +2788,7 @@ class SoundcloudIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return # extract uploader (which is in the url) @@ -2707,7 +2806,7 @@ class SoundcloudIE(InfoExtractor): info_json_bytes = compat_urllib_request.urlopen(request).read() info_json = info_json_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err)) return info = json.loads(info_json) @@ -2720,7 +2819,7 @@ class SoundcloudIE(InfoExtractor): stream_json_bytes = compat_urllib_request.urlopen(request).read() stream_json = stream_json_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err)) return streams = json.loads(stream_json) @@ -2736,6 +2835,87 @@ class SoundcloudIE(InfoExtractor): 'description': info['description'], }] +class SoundcloudSetIE(InfoExtractor): + """Information extractor for soundcloud.com sets + To access the media, the uid of the song and a stream token + must be extracted from the page source and the script must make + a request to media.soundcloud.com/crossdomain.xml. Then + the media can be grabbed by requesting from an url composed + of the stream token and uid + """ + + _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' + IE_NAME = u'soundcloud' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_resolve(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + # extract uploader (which is in the url) + uploader = mobj.group(1) + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2) + simple_title = uploader + u'-' + slug_title + + self.report_resolve('%s/sets/%s' % (uploader, slug_title)) + + url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) + resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + request = compat_urllib_request.Request(resolv_url) + try: + info_json_bytes = compat_urllib_request.urlopen(request).read() + info_json = info_json_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + return + + videos = [] + info = json.loads(info_json) + if 'errors' in info: + for err in info['errors']: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message'])) + return + + for track in info['tracks']: + video_id = track['id'] + self.report_extraction('%s/sets/%s' % (uploader, slug_title)) + + streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + request = compat_urllib_request.Request(streams_url) + try: + stream_json_bytes = compat_urllib_request.urlopen(request).read() + stream_json = stream_json_bytes.decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err)) + return + + streams = json.loads(stream_json) + mediaURL = streams['http_mp3_128_url'] + + videos.append({ + 'id': video_id, + 'url': mediaURL, + 'uploader': track['user']['username'], + 'upload_date': track['created_at'], + 'title': track['title'], + 'ext': u'mp3', + 'description': track['description'], + }) + return videos + class InfoQIE(InfoExtractor): """Information extractor for infoq.com""" @@ -2748,7 +2928,7 @@ class InfoQIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return webpage = self._download_webpage(url, video_id=url) @@ -2757,7 +2937,7 @@ class InfoQIE(InfoExtractor): # Extract video URL mobj = re.search(r"jsclassref='([^']*)'", webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video url') + self._downloader.report_error(u'unable to extract video url') return real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')) video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id @@ -2765,7 +2945,7 @@ class InfoQIE(InfoExtractor): # Extract title mobj = re.search(r'contentTitle = "(.*?)";', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = mobj.group(1) @@ -2848,7 +3028,7 @@ class MixcloudIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return # extract uploader & filename from url uploader = mobj.group(1).decode('utf-8') @@ -2862,7 +3042,7 @@ class MixcloudIE(InfoExtractor): self.report_download_json(file_url) jsonData = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err)) return # parse JSON @@ -2886,7 +3066,7 @@ class MixcloudIE(InfoExtractor): break # got it! else: if req_format not in formats: - self._downloader.trouble(u'ERROR: format is not available') + self._downloader.report_error(u'format is not available') return url_list = self.get_urls(formats, req_format) @@ -2940,14 +3120,14 @@ class StanfordOpenClassroomIE(InfoExtractor): try: metaXml = compat_urllib_request.urlopen(xmlUrl).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err)) return mdoc = xml.etree.ElementTree.fromstring(metaXml) try: info['title'] = mdoc.findall('./title')[0].text info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text except IndexError: - self._downloader.trouble(u'\nERROR: Invalid metadata XML file') + self._downloader.report_error(u'Invalid metadata XML file') return info['ext'] = info['url'].rpartition('.')[2] return [info] @@ -2999,7 +3179,7 @@ class StanfordOpenClassroomIE(InfoExtractor): try: rootpage = compat_urllib_request.urlopen(rootURL).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err)) + self._downloader.report_error(u'unable to download course info page: ' + compat_str(err)) return info['title'] = info['id'] @@ -3031,7 +3211,7 @@ class MTVIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return if not mobj.group('proto'): url = 'http://' + url @@ -3041,25 +3221,25 @@ class MTVIE(InfoExtractor): mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract song name') + self._downloader.report_error(u'unable to extract song name') return song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract performer') + self._downloader.report_error(u'unable to extract performer') return performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) video_title = performer + ' - ' + song_name mobj = re.search(r'', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to mtvn_uri') + self._downloader.report_error(u'unable to mtvn_uri') return mtvn_uri = mobj.group(1) mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract content id') + self._downloader.report_error(u'unable to extract content id') return content_id = mobj.group(1) @@ -3069,7 +3249,7 @@ class MTVIE(InfoExtractor): try: metadataXml = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err)) return mdoc = xml.etree.ElementTree.fromstring(metadataXml) @@ -3141,7 +3321,7 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('ID') @@ -3152,7 +3332,7 @@ class YoukuIE(InfoExtractor): self.report_download_webpage(video_id) jsondata = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return self.report_extraction(video_id) @@ -3183,7 +3363,7 @@ class YoukuIE(InfoExtractor): fileid = config['data'][0]['streamfileids'][format] keys = [s['k'] for s in config['data'][0]['segs'][format]] except (UnicodeDecodeError, ValueError, KeyError): - self._downloader.trouble(u'ERROR: unable to extract info section') + self._downloader.report_error(u'unable to extract info section') return files_info=[] @@ -3230,7 +3410,7 @@ class XNXXIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) @@ -3241,24 +3421,24 @@ class XNXXIE(InfoExtractor): webpage_bytes = compat_urllib_request.urlopen(url).read() webpage = webpage_bytes.decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err) + self._downloader.report_error(u'unable to download video webpage: %s' % err) return result = re.search(self.VIDEO_URL_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video url') + self._downloader.report_error(u'unable to extract video url') return video_url = compat_urllib_parse.unquote(result.group(1)) result = re.search(self.VIDEO_TITLE_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return video_title = result.group(1) result = re.search(self.VIDEO_THUMB_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video thumbnail') + self._downloader.report_error(u'unable to extract video thumbnail') return video_thumbnail = result.group(1) @@ -3307,7 +3487,7 @@ class GooglePlusIE(InfoExtractor): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + self._downloader.report_error(u'Invalid URL: %s' % url) return post_url = mobj.group(0) @@ -3321,7 +3501,7 @@ class GooglePlusIE(InfoExtractor): try: webpage = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err)) return # Extract update date @@ -3356,14 +3536,14 @@ class GooglePlusIE(InfoExtractor): pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' mobj = re.search(pattern, webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract video page URL') + self._downloader.report_error(u'unable to extract video page URL') video_page = mobj.group(1) request = compat_urllib_request.Request(video_page) try: webpage = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err)) return self.report_extract_vid_page(video_page) @@ -3373,7 +3553,7 @@ class GooglePlusIE(InfoExtractor): pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' mobj = re.findall(pattern, webpage) if len(mobj) == 0: - self._downloader.trouble(u'ERROR: unable to extract video links') + self._downloader.report_error(u'unable to extract video links') # Sort in resolution links = sorted(mobj) @@ -3405,7 +3585,7 @@ class NBAIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group(1) @@ -3461,13 +3641,13 @@ class JustinTVIE(InfoExtractor): webpage_bytes = urlh.read() webpage = webpage_bytes.decode('utf-8', 'ignore') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err)) + self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err)) return response = json.loads(webpage) if type(response) != list: error_text = response.get('error', 'unknown error') - self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text) + self._downloader.report_error(u'Justin.tv API: %s' % error_text) return info = [] for clip in response: @@ -3492,7 +3672,7 @@ class JustinTVIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return api = 'http://api.justin.tv' @@ -3527,7 +3707,7 @@ class FunnyOrDieIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('id') @@ -3535,13 +3715,13 @@ class FunnyOrDieIE(InfoExtractor): m = re.search(r']*>\s*]*>\s*\s+(?P.*?)</a>", webpage) + m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) if not m: self._downloader.trouble(u'Cannot find video title') - title = unescapeHTML(m.group('title')) + title = clean_html(m.group('title')) m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) if m: @@ -3558,65 +3738,17 @@ class FunnyOrDieIE(InfoExtractor): } return [info] -class TweetReelIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find status ID') - status_id = m.group(1) - - m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'WARNING: Cannot find description') - desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip() - - m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL) - if not m: - self._downloader.trouble(u'ERROR: Cannot find uploader') - uploader = unescapeHTML(m.group('uploader')) - uploader_id = unescapeHTML(m.group('uploader_id')) - - m = re.search(r'<span unixtime="([0-9]+)"', webpage) - if not m: - self._downloader.trouble(u'ERROR: Cannot find upload date') - upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d') - - title = desc - video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov' - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mov', - 'title': title, - 'description': desc, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'internal_id': status_id, - 'upload_date': upload_date - } - return [info] - class SteamIE(InfoExtractor): - _VALID_URL = r"""http://store.steampowered.com/ + _VALID_URL = r"""http://store.steampowered.com/ (?P<urltype>video|app)/ #If the page is only for videos or for a game (?P<gameID>\d+)/? (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID """ - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) @@ -3636,7 +3768,7 @@ class SteamIE(InfoExtractor): video_url = vid.group('videoURL') video_thumb = thumb.group('thumbnail') if not video_url: - self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id) + self._downloader.report_error(u'Cannot find video url for %s' % video_id) info = { 'id':video_id, 'url':video_url, @@ -3669,6 +3801,62 @@ class UstreamIE(InfoExtractor): } return [info] +class WorldStarHipHopIE(InfoExtractor): + _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)' + IE_NAME = u'WorldStarHipHop' + + def _real_extract(self, url): + _src_url = r"""(http://hw-videos.*(?:mp4|flv))""" + + webpage_src = compat_urllib_request.urlopen(url).read() + webpage_src = webpage_src.decode('utf-8') + + mobj = re.search(_src_url, webpage_src) + + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + + if mobj is not None: + video_url = mobj.group() + if 'mp4' in video_url: + ext = 'mp4' + else: + ext = 'flv' + else: + self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id) + return + + _title = r"""<title>(.*)""" + + mobj = re.search(_title, webpage_src) + + if mobj is not None: + title = mobj.group(1) + else: + title = 'World Start Hip Hop - %s' % time.ctime() + + _thumbnail = r"""rel="image_src" href="(.*)" />""" + mobj = re.search(_thumbnail, webpage_src) + + # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. + if mobj is not None: + thumbnail = mobj.group(1) + else: + _title = r"""candytitles.*>(.*)""" + mobj = re.search(_title, webpage_src) + if mobj is not None: + title = mobj.group(1) + thumbnail = None + + results = [{ + 'id': video_id, + 'url' : video_url, + 'title' : title, + 'thumbnail' : thumbnail, + 'ext' : ext, + }] + return results + class RBMARadioIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' @@ -3708,7 +3896,7 @@ class RBMARadioIE(InfoExtractor): class YouPornIE(InfoExtractor): """Information extractor for youporn.com.""" _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' - + def _print_formats(self, formats): """Print all available formats""" print(u'Available formats:') @@ -3726,7 +3914,7 @@ class YouPornIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('videoid') @@ -3744,7 +3932,7 @@ class YouPornIE(InfoExtractor): # Get the video date result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract video date') + self._downloader.report_warning(u'unable to extract video date') upload_date = None else: upload_date = result.group('date').strip() @@ -3752,7 +3940,7 @@ class YouPornIE(InfoExtractor): # Get the video uploader result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) if result is None: - self._downloader.to_stderr(u'WARNING: unable to extract uploader') + self._downloader.report_warning(u'unable to extract uploader') video_uploader = None else: video_uploader = result.group('uploader').strip() @@ -3770,8 +3958,8 @@ class YouPornIE(InfoExtractor): links = re.findall(LINK_RE, download_list_html) if(len(links) == 0): raise ExtractorError(u'ERROR: no known formats available for video') - - self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) + + self._downloader.to_screen(u'[youporn] Links found: %d' % len(links)) formats = [] for link in links: @@ -3818,11 +4006,11 @@ class YouPornIE(InfoExtractor): else: format = self._specific( req_format, formats ) if result is None: - self._downloader.trouble(u'ERROR: requested format not available') + self._downloader.report_error(u'requested format not available') return return [format] - + class PornotubeIE(InfoExtractor): """Information extractor for pornotube.com.""" @@ -3831,7 +4019,7 @@ class PornotubeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('videoid') @@ -3844,7 +4032,7 @@ class PornotubeIE(InfoExtractor): VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' result = re.search(VIDEO_URL_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video url') + self._downloader.report_error(u'unable to extract video url') return video_url = compat_urllib_parse.unquote(result.group('url')) @@ -3852,7 +4040,7 @@ class PornotubeIE(InfoExtractor): VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' result = re.search(VIDEO_UPLOADED_RE, webpage) if result is None: - self._downloader.trouble(u'ERROR: unable to extract video title') + self._downloader.report_error(u'unable to extract video title') return upload_date = result.group('date') @@ -3873,7 +4061,7 @@ class YouJizzIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + self._downloader.report_error(u'invalid URL: %s' % url) return video_id = mobj.group('videoid') @@ -3894,7 +4082,7 @@ class YouJizzIE(InfoExtractor): embed_page_url = result.group(0).strip() video_id = result.group('videoid') - + webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL @@ -3968,11 +4156,11 @@ class KeekIE(InfoExtractor): webpage = self._download_webpage(url, video_id) m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage) title = unescapeHTML(m.group('title')) - m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage) - uploader = unescapeHTML(m.group('uploader')) + m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage) + uploader = clean_html(m.group('uploader')) info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'mp4', 'title': title, 'thumbnail': thumbnail, @@ -3990,9 +4178,10 @@ class TEDIE(InfoExtractor): /(?P<name>\w+) # Here goes the name and then ".html" ''' - def suitable(self, url): + @classmethod + def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(self._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) @@ -4015,31 +4204,30 @@ class TEDIE(InfoExtractor): ([.\s]*?)data-playlist_item_id="(\d+)" ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)" ''' - video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>' + video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>' webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) info=[] for m_video, m_name in zip(m_videos,m_names): - video_dic={ - 'id': m_video.group('video_id'), - 'url': self._talk_video_link(m_video.group('mediaSlug')), - 'ext': 'mp4', - 'title': m_name.group('fullname') - } - info.append(video_dic) + video_id=m_video.group('video_id') + talk_url='http://www.ted.com%s' % m_name.group('talk_url') + info.append(self._talk_info(talk_url,video_id)) return info + def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" m=re.match(self._VALID_URL, url,re.VERBOSE) videoName=m.group('name') webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) # If the url includes the language we get the title translated - title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>' + title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>' title=re.search(title_RE, webpage).group('title') info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) "id":(?P<videoID>[\d]+).*? "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' + thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' + thumb_match=re.search(thumb_RE,webpage) info_match=re.search(info_RE,webpage,re.VERBOSE) video_id=info_match.group('videoID') mediaSlug=info_match.group('mediaSlug') @@ -4048,13 +4236,14 @@ class TEDIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title + 'title': title, + 'thumbnail': thumb_match.group('thumbnail') } return info class MySpassIE(InfoExtractor): _VALID_URL = r'http://www.myspass.de/.*' - + def _real_extract(self, url): META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' @@ -4064,22 +4253,22 @@ class MySpassIE(InfoExtractor): url_parent_path, video_id = os.path.split(url_path) if not video_id: _, video_id = os.path.split(url_parent_path) - + # get metadata metadata_url = META_DATA_URL_TEMPLATE % video_id metadata_text = self._download_webpage(metadata_url, video_id) metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) - + # extract values from metadata url_flv_el = metadata.find('url_flv') if url_flv_el is None: - self._downloader.trouble(u'ERROR: unable to extract download url') + self._downloader.report_error(u'unable to extract download url') return video_url = url_flv_el.text extension = os.path.splitext(video_url)[1][1:] title_el = metadata.find('title') if title_el is None: - self._downloader.trouble(u'ERROR: unable to extract title') + self._downloader.report_error(u'unable to extract title') return title = title_el.text format_id_el = metadata.find('format_id') @@ -4108,6 +4297,89 @@ class MySpassIE(InfoExtractor): } return [info] +class SpiegelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) + if not m: + raise ExtractorError(u'Cannot find title') + video_title = unescapeHTML(m.group(1)) + + xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_code = self._download_webpage(xml_url, video_id, + note=u'Downloading XML', errnote=u'Failed to download XML') + + idoc = xml.etree.ElementTree.fromstring(xml_code) + last_type = idoc[-1] + filename = last_type.findall('./filename')[0].text + duration = float(last_type.findall('./duration')[0].text) + + video_url = 'http://video2.spiegel.de/flash/' + filename + video_ext = filename.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': video_title, + 'duration': duration, + } + return [info] + +class LiveLeakIE(InfoExtractor): + + _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' + IE_NAME = u'liveleak' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + video_id = mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + + m = re.search(r'file: "(.*?)",', webpage) + if not m: + self._downloader.report_error(u'unable to find video url') + return + video_url = m.group(1) + + m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) + if not m: + self._downloader.trouble(u'Cannot find video title') + title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() + + m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) + if m: + desc = unescapeHTML(m.group('desc')) + else: + desc = None + + m = re.search(r'By:.*?(\w+)</a>', webpage) + if m: + uploader = clean_html(m.group(1)) + else: + uploader = None + + info = { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': title, + 'description': desc, + 'uploader': uploader + } + + return [info] + + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4134,6 +4406,7 @@ def gen_extractors(): EscapistIE(), CollegeHumorIE(), XVideosIE(), + SoundcloudSetIE(), SoundcloudIE(), InfoQIE(), MixcloudIE(), @@ -4147,9 +4420,9 @@ def gen_extractors(): GooglePlusIE(), ArteTvIE(), NBAIE(), + WorldStarHipHopIE(), JustinTVIE(), FunnyOrDieIE(), - TweetReelIE(), SteamIE(), UstreamIE(), RBMARadioIE(), @@ -4157,7 +4430,7 @@ def gen_extractors(): KeekIE(), TEDIE(), MySpassIE(), + SpiegelIE(), + LiveLeakIE(), GenericIE() ] - -