X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=f0d3e0c63e91766631121dd3c1ef06fc98e41d2a;hb=96731798dbdd5a8878ac5cf29b69c6c7c821311b;hp=a39e865dc13966e51f7bad1314a63db7d2b8e13a;hpb=abe7a3ac2a85b0323820270f0920d44682b5cd11;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index a39e865dc..f0d3e0c63 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -38,18 +38,24 @@ class InfoExtractor(object): The dictionaries must include the following fields: - id: Video identifier. - url: Final video URL. - uploader: Nickname of the video uploader. - title: Video title, unescaped. - ext: Video filename extension. + id: Video identifier. + url: Final video URL. + uploader: Nickname of the video uploader, unescaped. + upload_date: Video upload date (YYYYMMDD). + title: Video title, unescaped. + ext: Video filename extension. The following fields are optional: format: The video format, defaults to ext (used for --get-format) thumbnail: Full URL to a video thumbnail image. - description One-line video description. + description: One-line video description. player_url: SWF Player URL (used for rtmpdump). + subtitles: The .srt file contents. + urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib2.urlopen + + The fields should all be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. @@ -57,10 +63,14 @@ class InfoExtractor(object): _real_extract() must return a *list* of information dictionaries as described above. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. """ _ready = False _downloader = None + _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" @@ -71,6 +81,10 @@ class InfoExtractor(object): """Receives a URL and returns True if suitable for this IE.""" return re.match(self._VALID_URL, url) is not None + def working(self): + """Getter method for _WORKING.""" + return self._WORKING + def initialize(self): """Initializes an instance (authentication, etc).""" if not self._ready: @@ -366,7 +380,7 @@ class YoutubeIE(InfoExtractor): video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) # upload date - upload_date = u'NA' + upload_date = None mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) @@ -617,7 +631,7 @@ class MetacafeIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), }] @@ -693,7 +707,7 @@ class DailymotionIE(InfoExtractor): return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_uploader = u'NA' + video_uploader = None mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: # lookin for official user @@ -705,7 +719,7 @@ class DailymotionIE(InfoExtractor): else: video_uploader = mobj.group(1) - video_upload_date = u'NA' + video_upload_date = None mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) if mobj is not None: video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) @@ -806,8 +820,8 @@ class GoogleIE(InfoExtractor): return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), - 'uploader': u'NA', - 'upload_date': u'NA', + 'uploader': None, + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), }] @@ -872,7 +886,7 @@ class PhotobucketIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), }] @@ -1012,12 +1026,11 @@ class YahooIE(InfoExtractor): 'id': video_id.decode('utf-8'), 'url': video_url, 'uploader': video_uploader, - 'upload_date': u'NA', + 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, - 'thumbnail': video_thumbnail, }] @@ -1085,7 +1098,7 @@ class VimeoIE(InfoExtractor): else: video_description = '' # Extract upload date - video_upload_date = u'NA' + video_upload_date = None mobj = re.search(r'', webpage) if mobj is not None: video_upload_date = mobj.group(1) @@ -1134,6 +1147,143 @@ class VimeoIE(InfoExtractor): }] +class ArteTvIE(InfoExtractor): + """arte.tv information extractor.""" + + _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' + _LIVE_URL = r'index-[0-9]+\.html$' + + IE_NAME = u'arte.tv' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id) + + def fetch_webpage(self, url): + self._downloader.increment_downloads() + request = urllib2.Request(url) + try: + self.report_download_webpage(url) + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) + return + except ValueError, err: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + return webpage + + def grep_webpage(self, url, regex, regexFlags, matchTuples): + page = self.fetch_webpage(url) + mobj = re.search(regex, page, regexFlags) + info = {} + + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + for (i, key, err) in matchTuples: + if mobj.group(i) is None: + self._downloader.trouble(err) + return + else: + info[key] = mobj.group(i) + + return info + + def extractLiveStream(self, url): + video_lang = url.split('/')[-4] + info = self.grep_webpage( + url, + r'src="(.*?/videothek_js.*?\.js)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + http_host = url.split('/')[2] + next_url = 'http://%s%s' % (http_host, urllib.unquote(info.get('url'))) + info = self.grep_webpage( + next_url, + r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + + '(http://.*?\.swf).*?' + + '(rtmp://.*?)\'', + re.DOTALL, + [ + (1, 'path', u'ERROR: could not extract video path: %s' % url), + (2, 'player', u'ERROR: could not extract video player: %s' % url), + (3, 'url', u'ERROR: could not extract video url: %s' % url) + ] + ) + video_url = u'%s/%s' % (info.get('url'), info.get('path')) + + def extractPlus7Stream(self, url): + video_lang = url.split('/')[-3] + info = self.grep_webpage( + url, + r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', + 0, + [ + (1, 'url', u'ERROR: Invalid URL: %s' % url) + ] + ) + next_url = urllib.unquote(info.get('url')) + info = self.grep_webpage( + next_url, + r'