Merge branch 'master' of github.com:rg3/youtube-dl
[youtube-dl] / youtube_dl / InfoExtractors.py
index 5cc0e9195f4e751b99e73af76aab1fbf730b1f3f..620cce1893788e9ad1abc326896a878c63524d13 100755 (executable)
@@ -114,8 +114,8 @@ class InfoExtractor(object):
     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
         """ Returns the response handle """
         if note is None:
-            note = u'Downloading video webpage'
-        if note is not False:
+            self.report_download_webpage(video_id)
+        elif note is not False:
             self.to_screen(u'%s: %s' % (video_id, note))
         try:
             return compat_urllib_request.urlopen(url_or_request)
@@ -152,6 +152,10 @@ class InfoExtractor(object):
         """Report information extraction."""
         self.to_screen(u'%s: Extracting information' % id_or_name)
 
+    def report_download_webpage(self, video_id):
+        """Report webpage download."""
+        self.to_screen(u'%s: Downloading webpage' % video_id)
+
     def report_age_confirmation(self):
         """Report attempt to confirm age."""
         self.to_screen(u'Confirming age')
@@ -558,19 +562,18 @@ class YoutubeIE(InfoExtractor):
         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
         if mobj is not None:
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
-            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
-            for expression in format_expressions:
-                try:
-                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
-                except:
-                    pass
+            upload_date = unified_strdate(upload_date)
 
         # description
         video_description = get_element_by_id("eow-description", video_webpage)
         if video_description:
             video_description = clean_html(video_description)
         else:
-            video_description = ''
+            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+            if fd_mobj:
+                video_description = unescapeHTML(fd_mobj.group(1))
+            else:
+                video_description = u''
 
         # subtitles
         video_subtitles = None
@@ -680,17 +683,10 @@ class MetacafeIE(InfoExtractor):
     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
     IE_NAME = u'metacafe'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_disclaimer(self):
         """Report disclaimer retrieval."""
         self.to_screen(u'Retrieving disclaimer')
 
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def _real_initialize(self):
         # Retrieve disclaimer
         request = compat_urllib_request.Request(self._DISCLAIMER)
@@ -791,9 +787,6 @@ class DailymotionIE(InfoExtractor):
     IE_NAME = u'dailymotion'
     _WORKING = False
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def _real_extract(self, url):
         # Extract id and simplified title from URL
         mobj = re.match(self._VALID_URL, url)
@@ -875,13 +868,6 @@ class PhotobucketIE(InfoExtractor):
     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
     IE_NAME = u'photobucket'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def _real_extract(self, url):
         # Extract id from URL
         mobj = re.match(self._VALID_URL, url)
@@ -940,13 +926,6 @@ class YahooIE(InfoExtractor):
     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
     IE_NAME = u'video.yahoo'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def _real_extract(self, url, new_video=True):
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
@@ -1076,13 +1055,6 @@ class VimeoIE(InfoExtractor):
     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
     IE_NAME = u'vimeo'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def _real_extract(self, url, new_video=True):
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
@@ -1116,7 +1088,10 @@ class VimeoIE(InfoExtractor):
             config = webpage.split(' = {config:')[1].split(',assets:')[0]
             config = json.loads(config)
         except:
-            self._downloader.report_error(u'unable to extract info section')
+            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
+                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
+            else:
+                self._downloader.report_error(u'unable to extract info section')
             return
 
         # Extract title
@@ -1193,13 +1168,6 @@ class ArteTvIE(InfoExtractor):
 
     IE_NAME = u'arte.tv'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def fetch_webpage(self, url):
         request = compat_urllib_request.Request(url)
         try:
@@ -1323,14 +1291,11 @@ class GenericIE(InfoExtractor):
     _VALID_URL = r'.*'
     IE_NAME = u'generic'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_webpage(self, video_id):
         """Report webpage download."""
         if not self._downloader.params.get('test', False):
             self._downloader.report_warning(u'Falling back on generic information extractor.')
-        self.to_screen(u'%s: Downloading webpage' % video_id)
+        super(GenericIE, self).report_download_webpage(video_id)
 
     def report_following_redirect(self, new_url):
         """Report information extraction."""
@@ -1465,9 +1430,6 @@ class YoutubeSearchIE(InfoExtractor):
     _max_youtube_results = 1000
     IE_NAME = u'youtube:search'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_page(self, query, pagenum):
         """Report attempt to download search page with given number."""
         query = query.decode(preferredencoding())
@@ -1542,9 +1504,6 @@ class GoogleSearchIE(InfoExtractor):
     _max_google_results = 1000
     IE_NAME = u'video.google:search'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_page(self, query, pagenum):
         """Report attempt to download playlist page with given number."""
         query = query.decode(preferredencoding())
@@ -1626,9 +1585,6 @@ class YahooSearchIE(InfoExtractor):
     _max_yahoo_results = 1000
     IE_NAME = u'video.yahoo:search'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_page(self, query, pagenum):
         """Report attempt to download playlist page with given number."""
         query = query.decode(preferredencoding())
@@ -1722,9 +1678,6 @@ class YoutubePlaylistIE(InfoExtractor):
     _MAX_RESULTS = 50
     IE_NAME = u'youtube:playlist'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     @classmethod
     def suitable(cls, url):
         """Receives a URL and returns True if suitable for this IE."""
@@ -1765,12 +1718,11 @@ class YoutubePlaylistIE(InfoExtractor):
             if 'feed' not in response:
                 self._downloader.report_error(u'Got a malformed response from YouTube API')
                 return
+            playlist_title = response['feed']['title']['$t']
             if 'entry' not in response['feed']:
                 # Number of videos is a multiple of self._MAX_RESULTS
                 break
 
-            playlist_title = response['feed']['title']['$t']
-
             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                         for entry in response['feed']['entry']
                         if 'content' in entry ]
@@ -1869,9 +1821,6 @@ class YoutubeUserIE(InfoExtractor):
     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
     IE_NAME = u'youtube:user'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_page(self, username, start_index):
         """Report attempt to download user page."""
         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
@@ -1938,9 +1887,6 @@ class BlipTVUserIE(InfoExtractor):
     _PAGE_SIZE = 12
     IE_NAME = u'blip.tv:user'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_page(self, username, pagenum):
         """Report attempt to download user page."""
         self.to_screen(u'user %s: Downloading video ids from page %d' %
@@ -2016,10 +1962,6 @@ class DepositFilesIE(InfoExtractor):
 
     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
 
-    def report_download_webpage(self, file_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % file_id)
-
     def _real_extract(self, url):
         file_id = url.split('/')[-1]
         # Rebuild url in english locale
@@ -2270,9 +2212,6 @@ class MyVideoIE(InfoExtractor):
     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
     IE_NAME = u'myvideo'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def _real_extract(self,url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -2441,7 +2380,7 @@ class ComedyCentralIE(InfoExtractor):
             shortMediaId = mediaId.split(':')[-1]
             showId = mediaId.split(':')[-2].replace('.com', '')
             officialTitle = itemEl.findall('./title')[0].text
-            officialDate = itemEl.findall('./pubDate')[0].text
+            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
 
             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
@@ -2708,9 +2647,6 @@ class SoundcloudIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
     IE_NAME = u'soundcloud'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_resolve(self, video_id):
         """Report information extraction."""
         self.to_screen(u'%s: Resolving id' % video_id)
@@ -2754,12 +2690,13 @@ class SoundcloudIE(InfoExtractor):
 
         streams = json.loads(stream_json)
         mediaURL = streams['http_mp3_128_url']
+        upload_date = unified_strdate(info['created_at'])
 
         return [{
             'id':       info['id'],
             'url':      mediaURL,
             'uploader': info['user']['username'],
-            'upload_date':  info['created_at'],
+            'upload_date': upload_date,
             'title':    info['title'],
             'ext':      u'mp3',
             'description': info['description'],
@@ -2775,10 +2712,7 @@ class SoundcloudSetIE(InfoExtractor):
      """
 
     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
-    IE_NAME = u'soundcloud'
-
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
+    IE_NAME = u'soundcloud:set'
 
     def report_resolve(self, video_id):
         """Report information extraction."""
@@ -2857,7 +2791,7 @@ class InfoQIE(InfoExtractor):
         self.report_extraction(url)
 
         # Extract video URL
-        mobj = re.search(r"jsclassref='([^']*)'", webpage)
+        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
         if mobj is None:
             self._downloader.report_error(u'unable to extract video url')
             return
@@ -2900,9 +2834,6 @@ class MixcloudIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
     IE_NAME = u'mixcloud'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_download_json(self, file_id):
         """Report JSON download."""
         self.to_screen(u'Downloading json')
@@ -3010,10 +2941,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
     IE_NAME = u'stanfordoc'
 
-    def report_download_webpage(self, objid):
-        """Report information extraction."""
-        self.to_screen(u'%s: Downloading webpage' % objid)
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -3192,10 +3119,6 @@ class MTVIE(InfoExtractor):
 class YoukuIE(InfoExtractor):
     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
 
-    def report_download_webpage(self, file_id):
-        """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % file_id)
-
     def _gen_sid(self):
         nowTime = int(time.time() * 1000)
         random1 = random.randint(1000,1998)
@@ -3305,10 +3228,6 @@ class XNXXIE(InfoExtractor):
     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
 
-    def report_webpage(self, video_id):
-        """Report information extraction"""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -3316,7 +3235,7 @@ class XNXXIE(InfoExtractor):
             return
         video_id = mobj.group(1)
 
-        self.report_webpage(video_id)
+        self.report_download_webpage(video_id)
 
         # Get webpage content
         try:
@@ -3362,9 +3281,6 @@ class GooglePlusIE(InfoExtractor):
     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
     IE_NAME = u'plus.google'
 
-    def __init__(self, downloader=None):
-        InfoExtractor.__init__(self, downloader)
-
     def report_extract_entry(self, url):
         """Report downloading extry"""
         self.to_screen(u'Downloading entry: %s' % url)
@@ -3640,6 +3556,7 @@ class FunnyOrDieIE(InfoExtractor):
 
 class SteamIE(InfoExtractor):
     _VALID_URL = r"""http://store.steampowered.com/
+                (agecheck/)?
                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
                 (?P<gameID>\d+)/?
                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
@@ -3838,7 +3755,7 @@ class YouPornIE(InfoExtractor):
             self._downloader.report_warning(u'unable to extract video date')
             upload_date = None
         else:
-            upload_date = result.group('date').strip()
+            upload_date = unified_strdate(result.group('date').strip())
 
         # Get the video uploader
         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
@@ -3945,7 +3862,7 @@ class PornotubeIE(InfoExtractor):
         if result is None:
             self._downloader.report_error(u'unable to extract video title')
             return
-        upload_date = result.group('date')
+        upload_date = unified_strdate(result.group('date'))
 
         info = {'id': video_id,
                 'url': video_url,