X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=1575a3434d3258d0ff743c1275b4edf96f2bfab2;hb=9993976ae436c62ef86c2c45f0f001e5f7e471bd;hp=4fcff77ff7e55ee4ff3c7a4e8fb7d940b72cb716;hpb=3c4d6c9ebae975bafcf3ae1b8d419cb734b50581;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4fcff77ff..1575a3434 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -97,7 +97,25 @@ class InfoExtractor(object): class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" - _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube\.majestyc\.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' + _VALID_URL = r"""^ + ( + (?:https?://)? # http(s):// (optional) + (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx) + v= + ) + )? # optional -> youtube.com/xxxx is OK + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' @@ -136,6 +154,10 @@ class YoutubeIE(InfoExtractor): } IE_NAME = u'youtube' + def suitable(self, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(self._VALID_URL, url, re.VERBOSE) is not None + def report_lang(self): """Report attempt to set language.""" self._downloader.to_screen(u'[youtube] Setting language') @@ -270,7 +292,7 @@ class YoutubeIE(InfoExtractor): url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') # Extract video id from URL - mobj = re.match(self._VALID_URL, url) + mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return @@ -594,7 +616,7 @@ class MetacafeIE(InfoExtractor): class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' def __init__(self, downloader=None): @@ -615,7 +637,7 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: invalid URL: %s' % url) return - video_id = mobj.group(1) + video_id = mobj.group(1).split('_')[0].split('?')[0] video_extension = 'mp4' @@ -636,14 +658,22 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract media URL') return flashvars = urllib.unquote(mobj.group(1)) - if 'hqURL' in flashvars: max_quality = 'hqURL' - elif 'sdURL' in flashvars: max_quality = 'sdURL' - else: max_quality = 'ldURL' + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self._downloader.to_screen(u'[dailymotion] Using %s' % key) + break + else: + self._downloader.trouble(u'ERROR: unable to extract video URL') + return + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract media URL') + self._downloader.trouble(u'ERROR: unable to extract video URL') return - video_url = mobj.group(1).replace('\\/', '/') + + video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/') # TODO: support choosing qualities @@ -653,17 +683,23 @@ class DailymotionIE(InfoExtractor): return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) + video_uploader = u'NA' mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract uploader nickname') - return - video_uploader = mobj.group(1) + self._downloader.trouble(u'WARNING: unable to extract uploader nickname') + else: + video_uploader = mobj.group(1) + + video_upload_date = u'NA' + mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), - 'upload_date': u'NA', + 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', @@ -1472,9 +1508,9 @@ class YahooSearchIE(InfoExtractor): class YoutubePlaylistIE(InfoExtractor): """Information Extractor for YouTube playlists.""" - _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' + _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=.*?%s' + _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s' _MORE_PAGES_INDICATOR = r'yt-uix-pager-next' IE_NAME = u'youtube:playlist' @@ -3145,3 +3181,127 @@ class XNXXIE(InfoExtractor): 'player_url': None} return [info] + + +class GooglePlusIE(InfoExtractor): + """Information extractor for plus.google.com.""" + + _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)' + IE_NAME = u'plus.google' + + def __init__(self, downloader=None): + InfoExtractor.__init__(self, downloader) + + def report_extract_entry(self, url): + """Report downloading extry""" + self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8')) + + def report_date(self, upload_date): + """Report downloading extry""" + self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date) + + def report_uploader(self, uploader): + """Report downloading extry""" + self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8')) + + def report_title(self, video_title): + """Report downloading extry""" + self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8')) + + def report_extract_vid_page(self, video_page): + """Report information extraction.""" + self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8')) + + def _real_extract(self, url): + # Extract id from URL + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) + return + + post_url = mobj.group(0) + video_id = mobj.group(2) + + video_extension = 'flv' + + # Step 1, Retrieve post webpage to extract further information + self.report_extract_entry(post_url) + request = urllib2.Request(post_url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err)) + return + + # Extract update date + upload_date = u'NA' + pattern = 'title="Timestamp">(.*?)' + mobj = re.search(pattern, webpage) + if mobj: + upload_date = mobj.group(1) + # Convert timestring to a format suitable for filename + upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") + upload_date = upload_date.strftime('%Y%m%d') + self.report_date(upload_date) + + # Extract uploader + uploader = u'NA' + pattern = r'rel\="author".*?>(.*?)' + mobj = re.search(pattern, webpage) + if mobj: + uploader = mobj.group(1) + self.report_uploader(uploader) + + # Extract title + # Get the first line for title + video_title = u'NA' + pattern = r'