Merge pull request #888 from rg3/youtube_playlists_fix_886

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 6060a5988cb274a287bbc1ef219832f6a4c64622..15417f05a89899b7d0f06cd094998894fcc56d1e 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -420,7 +420,7 @@ class YoutubeIE(InfoExtractor):
      def _request_automatic_caption(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
-        sub_lang = self._downloader.params.get('subtitleslang')
+        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
          sub_format = self._downloader.params.get('subtitlesformat')
          self.to_screen(u'%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
@@ -732,8 +732,11 @@ class YoutubeIE(InfoExtractor):
              for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                  url_data = compat_parse_qs(url_data_str)
                  if 'itag' in url_data and 'url' in url_data:
-                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
-                    if not 'ratebypass' in url: url += '&ratebypass=yes'
+                    url = url_data['url'][0]
+                    if 'sig' in url_data:
+                        url += '&signature=' + url_data['sig'][0]
+                    if 'ratebypass' not in url:
+                        url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
  
              format_limit = self._downloader.params.get('format_limit', None)
@@ -940,16 +943,10 @@ class DailymotionIE(InfoExtractor):
          video_title = unescapeHTML(mobj.group('title'))
  
          video_uploader = None
-        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
-        if mobj is None:
-            # lookin for official user
-            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
-            if mobj_official is None:
-                self._downloader.report_warning(u'unable to extract uploader nickname')
-            else:
-                video_uploader = mobj_official.group(1)
-        else:
-            video_uploader = mobj.group(1)
+        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
+                                             # Looking for official user
+                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
+                                            webpage, 'video uploader')
  
          video_upload_date = None
          mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
@@ -1409,6 +1406,9 @@ class GenericIE(InfoExtractor):
          if mobj is None:
              # Broaden the search a little bit: JWPlayer JS loader
              mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Try to find twitter cards info
+            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
  
@@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor):
          #   Site Name | Video Title
          #   Video Title - Tagline | Site Name
          # and so on and so forth; it's just not practical
-        mobj = re.search(r'<title>(.*)</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title')
  
          # video uploader is domain name
-        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_uploader = mobj.group(1)
+        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+            url, u'video uploader')
  
          return [{
              'id':       video_id,
@@ -1460,7 +1456,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
  
      def report_download_page(self, query, pagenum):
          """Report attempt to download search page with given number."""
-        query = query.decode(preferredencoding())
          self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
  
      def _get_n_results(self, query, n):
@@ -1578,7 +1573,7 @@ class YoutubePlaylistIE(InfoExtractor):
                       |
                          ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                       )"""
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
+    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
      _MAX_RESULTS = 50
      IE_NAME = u'youtube:playlist'
  
@@ -1614,9 +1609,10 @@ class YoutubePlaylistIE(InfoExtractor):
                  # Number of videos is a multiple of self._MAX_RESULTS
                  break
  
-            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
-                        for entry in response['feed']['entry']
-                        if 'content' in entry ]
+            for entry in response['feed']['entry']:
+                index = entry['yt$position']['$t']
+                if 'media$group' in entry and 'media$player' in entry['media$group']:
+                    videos.append((index, entry['media$group']['media$player']['url']))
  
              if len(response['feed']['entry']) < self._MAX_RESULTS:
                  break
@@ -2377,8 +2373,8 @@ class EscapistIE(InfoExtractor):
          showName = mobj.group('showname')
          videoId = mobj.group('episode')
  
-        self.report_extraction(showName)
-        webpage = self._download_webpage(url, showName)
+        self.report_extraction(videoId)
+        webpage = self._download_webpage(url, videoId)
  
          videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
@@ -2389,10 +2385,13 @@ class EscapistIE(InfoExtractor):
          playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
              webpage, u'player url')
  
+        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+            webpage, u'player url').split(' : ')[-1]
+
          configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
          configUrl = compat_urllib_parse.unquote(configUrl)
  
-        configJSON = self._download_webpage(configUrl, showName,
+        configJSON = self._download_webpage(configUrl, videoId,
                                              u'Downloading configuration',
                                              u'unable to download configuration')
  
@@ -2412,7 +2411,7 @@ class EscapistIE(InfoExtractor):
              'url': videoUrl,
              'uploader': showName,
              'upload_date': None,
-            'title': showName,
+            'title': title,
              'ext': 'mp4',
              'thumbnail': imgUrl,
              'description': videoDesc,
@@ -3364,6 +3363,8 @@ class SteamIE(InfoExtractor):
                  (?P<gameID>\d+)/?
                  (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                  """
+    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
+    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
  
      @classmethod
      def suitable(cls, url):
@@ -3373,11 +3374,19 @@ class SteamIE(InfoExtractor):
      def _real_extract(self, url):
          m = re.match(self._VALID_URL, url, re.VERBOSE)
          gameID = m.group('gameID')
-        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
-        self.report_age_confirmation()
+
+        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
          webpage = self._download_webpage(videourl, gameID)
-        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
-        
+
+        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
+            videourl = self._AGECHECK_TEMPLATE % gameID
+            self.report_age_confirmation()
+            webpage = self._download_webpage(videourl, gameID)
+
+        self.report_extraction(gameID)
+        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
+                                             webpage, 'game title')
+
          urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
          mweb = re.finditer(urlRE, webpage)
          namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
@@ -3483,8 +3492,8 @@ class RBMARadioIE(InfoExtractor):
  
          webpage = self._download_webpage(url, video_id)
  
-        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
-            webpage, u'json data')
+        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
+            webpage, u'json data', flags=re.MULTILINE)
  
          try:
              data = json.loads(json_data)
@@ -3581,14 +3590,14 @@ class YouPornIE(InfoExtractor):
              size = format[0]
              bitrate = format[1]
              format = "-".join( format )
-            title = u'%s-%s-%s' % (video_title, size, bitrate)
+            # title = u'%s-%s-%s' % (video_title, size, bitrate)
  
              formats.append({
                  'id': video_id,
                  'url': video_url,
                  'uploader': video_uploader,
                  'upload_date': upload_date,
-                'title': title,
+                'title': video_title,
                  'ext': extension,
                  'format': format,
                  'thumbnail': thumbnail,
@@ -3787,10 +3796,6 @@ class TEDIE(InfoExtractor):
              self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
              return [self._playlist_videos_info(url,name,playlist_id)]
  
-    def _talk_video_link(self,mediaSlug):
-        '''Returns the video link for that mediaSlug'''
-        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
-
      def _playlist_videos_info(self,url,name,playlist_id=0):
          '''Returns the videos of the playlist'''
          video_RE=r'''
@@ -3803,9 +3808,8 @@ class TEDIE(InfoExtractor):
          m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
          m_names=re.finditer(video_name_RE,webpage)
  
-        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
-        m_playlist = re.search(playlist_RE, webpage)
-        playlist_title = m_playlist.group('playlist_title')
+        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
+                                                 webpage, 'playlist title')
  
          playlist_entries = []
          for m_video, m_name in zip(m_videos,m_names):
@@ -3816,27 +3820,28 @@ class TEDIE(InfoExtractor):
  
      def _talk_info(self, url, video_id=0):
          """Return the video for the talk in the url"""
-        m=re.match(self._VALID_URL, url,re.VERBOSE)
-        videoName=m.group('name')
-        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
+        m = re.match(self._VALID_URL, url,re.VERBOSE)
+        video_name = m.group('name')
+        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+        self.report_extraction(video_name)
          # If the url includes the language we get the title translated
-        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
-        title=re.search(title_RE, webpage).group('title')
-        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
-                        "id":(?P<videoID>[\d]+).*?
-                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
-        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
-        thumb_match=re.search(thumb_RE,webpage)
-        info_match=re.search(info_RE,webpage,re.VERBOSE)
-        video_id=info_match.group('videoID')
-        mediaSlug=info_match.group('mediaSlug')
-        video_url=self._talk_video_link(mediaSlug)
+        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+                                        webpage, 'title')
+        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
+                                    webpage, 'json data')
+        info = json.loads(json_data)
+        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
+                                       webpage, 'description', flags = re.DOTALL)
+        
+        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
+                                       webpage, 'thumbnail')
          info = {
-                'id': video_id,
-                'url': video_url,
+                'id': info['id'],
+                'url': info['htmlStreams'][-1]['file'],
                  'ext': 'mp4',
                  'title': title,
-                'thumbnail': thumb_match.group('thumbnail')
+                'thumbnail': thumbnail,
+                'description': desc,
                  }
          return info
  
@@ -4002,6 +4007,64 @@ class ARDIE(InfoExtractor):
              info["url"] = stream["video_url"]
          return [info]
  
+class ZDFIE(InfoExtractor):
+    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
+    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('video_id')
+
+        html = self._download_webpage(url, video_id)
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if streams is None:
+            raise ExtractorError(u'No media url found.')
+
+        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
+        # choose first/default media type and highest quality for now
+        for s in streams:        #find 300 - dsl1000mbit
+            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
+                stream_=s
+                break
+        for s in streams:        #find veryhigh - dsl2000mbit
+            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
+                stream_=s
+                break
+        if stream_ is None:
+            raise ExtractorError(u'No stream found.')
+
+        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
+
+        self.report_extraction(video_id)
+        mobj = re.search(self._TITLE, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract title')
+        title = unescapeHTML(mobj.group('title'))
+
+        mobj = re.search(self._MMS_STREAM, media_link)
+        if mobj is None:
+            mobj = re.search(self._RTSP_STREAM, media_link)
+            if mobj is None:
+                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
+        mms_url = mobj.group('video_url')
+
+        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract extention')
+        ext = mobj.group('ext')
+
+        return [{'id': video_id,
+                 'url': mms_url,
+                 'title': title,
+                 'ext': ext
+                 }]
+
  class TumblrIE(InfoExtractor):
      _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
  
@@ -4328,7 +4391,7 @@ class XHamsterIE(InfoExtractor):
              video_upload_date = None
              self._downloader.report_warning(u'Unable to extract upload date')
  
-        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
              webpage, u'uploader id', default=u'anonymous')
  
          video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
@@ -4395,6 +4458,92 @@ class HypemIE(InfoExtractor):
              'artist':   artist,
          }]
  
+class Vbox7IE(InfoExtractor):
+    """Information Extractor for Vbox7"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+    def _real_extract(self,url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group(1)
+
+        redirect_page, urlh = self._download_webpage_handle(url, video_id)
+        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
+        redirect_url = urlh.geturl() + new_location
+        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+        title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'title').split('/')[0].strip()
+
+        ext = "flv"
+        info_url = "http://vbox7.com/play/magare.do"
+        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
+        info_request = compat_urllib_request.Request(info_url, data)
+        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+        if info_response is None:
+            raise ExtractorError(u'Unable to extract the media url')
+        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+
+        return [{
+            'id':        video_id,
+            'url':       final_url,
+            'ext':       ext,
+            'title':     title,
+            'thumbnail': thumbnail_url,
+        }]
+
+class GametrailersIE(InfoExtractor):
+    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('id')
+        video_type = mobj.group('type')
+        webpage = self._download_webpage(url, video_id)
+        if video_type == 'full-episodes':
+            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
+        else:
+            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
+        mgid = self._search_regex(mgid_re, webpage, u'mgid')
+        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
+
+        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
+                                           video_id, u'Downloading video info')
+        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
+                                               video_id, u'Downloading video urls info')
+
+        self.report_extraction(video_id)
+        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
+                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
+                      <image>.*
+                        <url>(?P<thumb>.*?)</url>.*
+                      </image>'''
+
+        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
+        if m_info is None:
+            raise ExtractorError(u'Unable to extract video info')
+        video_title = m_info.group('title')
+        video_description = m_info.group('description')
+        video_thumb = m_info.group('thumb')
+
+        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
+        if m_urls is None or len(m_urls) == 0:
+            raise ExtractError(u'Unable to extrat video url')
+        # They are sorted from worst to best quality
+        video_url = m_urls[-1].group('url')
+
+        return {'url':         video_url,
+                'id':          video_id,
+                'title':       video_title,
+                # Videos are actually flv not mp4
+                'ext':         'flv',
+                'thumbnail':   video_thumb,
+                'description': video_description,
+                }
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4449,6 +4598,7 @@ def gen_extractors():
          SpiegelIE(),
          LiveLeakIE(),
          ARDIE(),
+        ZDFIE(),
          TumblrIE(),
          BandcampIE(),
          RedTubeIE(),
@@ -4459,6 +4609,8 @@ def gen_extractors():
          TeamcocoIE(),
          XHamsterIE(),
          HypemIE(),
+        Vbox7IE(),
+        GametrailersIE(),
          GenericIE()
      ]