X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=3b5be1d4266d03fcd51b260c39ba4e93fa1e971a;hb=c116339ddbe62d88b6295d519c03027070ec7d0d;hp=35ba6cc5c9d59752621178f568473f49a7357156;hpb=92b91c18780938283c505f5662c458e049bf3567;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 35ba6cc5c..3b5be1d42 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -29,37 +29,48 @@ class InfoExtractor(object): """Information Extractor class. Information extractors are the classes that, given a URL, extract - information from the video (or videos) the URL refers to. This - information includes the real video URL, the video title and simplified - title, author and others. The information is stored in a dictionary - which is then passed to the FileDownloader. The FileDownloader - processes this information possibly downloading the video to the file - system, among other possible outcomes. The dictionaries must include - the following fields: - - id: Video identifier. - url: Final video URL. - uploader: Nickname of the video uploader. - title: Literal title. - ext: Video filename extension. - format: Video format. - player_url: SWF Player URL (may be None). - - The following fields are optional. Their primary purpose is to allow - youtube-dl to serve as the backend for a video search function, such - as the one in youtube2mp3. They are only used when their respective - forced printing functions are called: - - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the FileDownloader. The FileDownloader processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The dictionaries must include the following fields: + + id: Video identifier. + url: Final video URL. + uploader: Nickname of the video uploader, unescaped. + upload_date: Video upload date (YYYYMMDD). + title: Video title, unescaped. + ext: Video filename extension. + + The following fields are optional: + + format: The video format, defaults to ext (used for --get-format) + thumbnail: Full URL to a video thumbnail image. + description: One-line video description. + player_url: SWF Player URL (used for rtmpdump). + subtitles: The .srt file contents. + urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib2.urlopen + + The fields should all be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + + _real_extract() must return a *list* of information dictionaries as + described above. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. """ _ready = False _downloader = None + _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" @@ -70,6 +81,10 @@ class InfoExtractor(object): """Receives a URL and returns True if suitable for this IE.""" return re.match(self._VALID_URL, url) is not None + def working(self): + """Getter method for _WORKING.""" + return self._WORKING + def initialize(self): """Initializes an instance (authentication, etc).""" if not self._ready: @@ -238,7 +253,7 @@ class YoutubeIE(InfoExtractor): else: raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError), err: - self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err)) + self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % u(err)) return # Set language @@ -247,7 +262,7 @@ class YoutubeIE(InfoExtractor): self.report_lang() urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err)) + self._downloader.to_stderr(u'WARNING: unable to set language: %s' % u(err)) return # No authentication to be performed @@ -270,7 +285,7 @@ class YoutubeIE(InfoExtractor): self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') return except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err)) + self._downloader.to_stderr(u'WARNING: unable to log in: %s' % u(err)) return # Confirm age @@ -283,7 +298,7 @@ class YoutubeIE(InfoExtractor): self.report_age_confirmation() age_results = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err)) return def _real_extract(self, url): @@ -305,7 +320,7 @@ class YoutubeIE(InfoExtractor): try: video_webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % u(err)) return # Attempt to extract SWF player URL @@ -327,7 +342,7 @@ class YoutubeIE(InfoExtractor): if 'token' in video_info: break except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % u(err)) return if 'token' not in video_info: if 'reason' in video_info: @@ -365,7 +380,7 @@ class YoutubeIE(InfoExtractor): video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) # upload date - upload_date = u'NA' + upload_date = None mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) @@ -390,7 +405,7 @@ class YoutubeIE(InfoExtractor): try: srt_list = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) + raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err)) srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) if not srt_lang_list: @@ -407,7 +422,7 @@ class YoutubeIE(InfoExtractor): try: srt_xml = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err)) + raise Trouble(u'WARNING: unable to download video subtitles: %s' % u(err)) if not srt_xml: raise Trouble(u'WARNING: unable to download video subtitles') video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) @@ -475,6 +490,9 @@ class YoutubeIE(InfoExtractor): # Extension video_extension = self._video_extensions.get(format_param, 'flv') + video_format = '{} - {}'.format(format_param.decode('utf-8') if format_param else video_extension.decode('utf-8'), + self._video_dimensions.get(format_param, '???')) + results.append({ 'id': video_id.decode('utf-8'), 'url': video_real_url.decode('utf-8'), @@ -482,7 +500,7 @@ class YoutubeIE(InfoExtractor): 'upload_date': upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'format': video_format, 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, @@ -526,7 +544,7 @@ class MetacafeIE(InfoExtractor): self.report_disclaimer() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % u(err)) return # Confirm age @@ -539,7 +557,7 @@ class MetacafeIE(InfoExtractor): self.report_age_confirmation() disclaimer = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable to confirm age: %s' % u(err)) return def _real_extract(self, url): @@ -563,7 +581,7 @@ class MetacafeIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err)) return # Extract URL, uploader and title from webpage @@ -613,11 +631,9 @@ class MetacafeIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -656,7 +672,7 @@ class DailymotionIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % u(err)) return # Extract URL, uploader and title from webpage @@ -691,7 +707,7 @@ class DailymotionIE(InfoExtractor): return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_uploader = u'NA' + video_uploader = None mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: # lookin for official user @@ -703,7 +719,7 @@ class DailymotionIE(InfoExtractor): else: video_uploader = mobj.group(1) - video_upload_date = u'NA' + video_upload_date = None mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) if mobj is not None: video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) @@ -715,8 +731,6 @@ class DailymotionIE(InfoExtractor): 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -754,7 +768,7 @@ class GoogleIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return # Extract URL, uploader, and title from webpage @@ -793,7 +807,7 @@ class GoogleIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return mobj = re.search(r'', webpage) if mobj is None: @@ -806,12 +820,10 @@ class GoogleIE(InfoExtractor): return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), - 'uploader': u'NA', - 'upload_date': u'NA', + 'uploader': None, + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -849,7 +861,7 @@ class PhotobucketIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return # Extract URL, uploader, and title from webpage @@ -874,11 +886,9 @@ class PhotobucketIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), - 'format': u'NA', - 'player_url': None, }] @@ -919,7 +929,7 @@ class YahooIE(InfoExtractor): try: webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) @@ -943,7 +953,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return # Extract uploader and title from webpage @@ -1001,7 +1011,7 @@ class YahooIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return # Extract media URL from playlist XML @@ -1016,13 +1026,11 @@ class YahooIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, - 'thumbnail': video_thumbnail, - 'player_url': None, }] @@ -1059,7 +1067,7 @@ class VimeoIE(InfoExtractor): self.report_download_webpage(video_id) webpage = urllib2.urlopen(request).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) return # Now we begin extracting as much information as we can from what we @@ -1090,7 +1098,7 @@ class VimeoIE(InfoExtractor): else: video_description = '' # Extract upload date - video_upload_date = u'NA' + video_upload_date = None mobj = re.search(r'', webpage) if mobj is not None: video_upload_date = mobj.group(1) @@ -1136,10 +1144,146 @@ class VimeoIE(InfoExtractor): 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, - 'player_url': None, }] +class ArteTvIE(InfoExtractor): + """arte.tv information extractor.""" + + _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' + _LIVE_URL = r'index-[0-9]+\.html$' + + IE_NAME = u'arte.tv' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) + + def fetch_webpage(self, url): + self._downloader.increment_downloads() + request = urllib2.Request(url) + try: + self.report_download_webpage(url) + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % u(err)) + return + except ValueError, err: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + return webpage + + def grep_webpage(self, url, regex, regexFlags, matchTuples): + page = self.fetch_webpage(url) + mobj = re.search(regex, page, regexFlags) + info = {} + + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + for (i, key, err) in matchTuples: + if mobj.group(i) is None: + self._downloader.trouble(err) + return + else: + info[key] = mobj.group(i) + + return info + + def extractLiveStream(self, url): + video_lang = url.split('/')[-4] + info = self.grep_webpage( + url, + r'src="(.*?/videothek_js.*?\.js)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + http_host = url.split('/')[2] + next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url'))) + info = self.grep_webpage( + next_url, + r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + + '(http://.*?\.swf).*?' + + '(rtmp://.*?)\'', + re.DOTALL, + [ + (1, 'path', u'ERROR: could not extract video path: %s' % url), + (2, 'player', u'ERROR: could not extract video player: %s' % url), + (3, 'url', u'ERROR: could not extract video url: %s' % url) + ] + ) + video_url = u'%s/%s' % (info.get('url'), info.get('path')) + + def extractPlus7Stream(self, url): + video_lang = url.split('/')[-3] + info = self.grep_webpage( + url, + r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + next_url = urllib.unquote(info.get('url')) + info = self.grep_webpage( + next_url, + r'