[vimeo] Fix description extraction

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 3a3a5a39e4bfcdce1a3bee26fee3e43326d62d90..67845349eed30db5ae2f037f4791196d9137b992 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,13 +7,13 @@ import itertools
  import json
  import os.path
  import re
  import json
  import os.path
  import re
-import string
  import struct
  import traceback
  import zlib
  
  from .common import InfoExtractor, SearchInfoExtractor
  from .subtitles import SubtitlesInfoExtractor
  import struct
  import traceback
  import zlib
  
  from .common import InfoExtractor, SearchInfoExtractor
  from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
  from ..utils import (
      compat_chr,
      compat_parse_qs,
  from ..utils import (
      compat_chr,
      compat_parse_qs,
@@ -151,6 +151,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                               )
                           ))
                           |youtu\.be/                                          # just youtu.be/xxxx
                               )
                           ))
                           |youtu\.be/                                          # just youtu.be/xxxx
+                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                           )
                       )?                                                       # all until now is optional -> you can pass the naked ID
                       ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                           )
                       )?                                                       # all until now is optional -> you can pass the naked ID
                       ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
@@ -209,23 +210,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
  
          # Dash webm
          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
  
          # Dash webm
-        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40},
-        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40},
-        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
-        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
-        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40},
-        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40},
-        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40},
+        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
  
          # Dash webm audio
  
          # Dash webm audio
-        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
-        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
+        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
+        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
  
          # RTMP (unnamed)
          '_rtmp': {'protocol': 'rtmp'},
  
          # RTMP (unnamed)
          '_rtmp': {'protocol': 'rtmp'},
@@ -251,7 +252,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              u"info_dict": {
                  u"upload_date": u"20120506",
                  u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
              u"info_dict": {
                  u"upload_date": u"20120506",
                  u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
-                u"description": u"md5:5b292926389560516e384ac437c0ec07",
+                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                  u"uploader": u"Icona Pop",
                  u"uploader_id": u"IconaPop"
              }
                  u"uploader": u"Icona Pop",
                  u"uploader_id": u"IconaPop"
              }
@@ -303,7 +304,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u'id': u'IB3lcPjvWLA',
                  u'ext': u'm4a',
                  u'title': u'Afrojack - The Spark ft. Spree Wilson',
                  u'id': u'IB3lcPjvWLA',
                  u'ext': u'm4a',
                  u'title': u'Afrojack - The Spark ft. Spree Wilson',
-                u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
+                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
                  u'uploader': u'AfrojackVEVO',
                  u'uploader_id': u'AfrojackVEVO',
                  u'upload_date': u'20131011',
                  u'uploader': u'AfrojackVEVO',
                  u'uploader_id': u'AfrojackVEVO',
                  u'upload_date': u'20131011',
@@ -438,113 +439,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
              r'signature=([a-zA-Z]+)', jscode,
      def _parse_sig_js(self, jscode):
          funcname = self._search_regex(
              r'signature=([a-zA-Z]+)', jscode,
-            u'Initial JS player signature function name')
-
-        functions = {}
-
-        def argidx(varname):
-            return string.lowercase.index(varname)
-
-        def interpret_statement(stmt, local_vars, allow_recursion=20):
-            if allow_recursion < 0:
-                raise ExtractorError(u'Recursion limit reached')
-
-            if stmt.startswith(u'var '):
-                stmt = stmt[len(u'var '):]
-            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
-                             r'=(?P<expr>.*)$', stmt)
-            if ass_m:
-                if ass_m.groupdict().get('index'):
-                    def assign(val):
-                        lvar = local_vars[ass_m.group('out')]
-                        idx = interpret_expression(ass_m.group('index'),
-                                                   local_vars, allow_recursion)
-                        assert isinstance(idx, int)
-                        lvar[idx] = val
-                        return val
-                    expr = ass_m.group('expr')
-                else:
-                    def assign(val):
-                        local_vars[ass_m.group('out')] = val
-                        return val
-                    expr = ass_m.group('expr')
-            elif stmt.startswith(u'return '):
-                assign = lambda v: v
-                expr = stmt[len(u'return '):]
-            else:
-                raise ExtractorError(
-                    u'Cannot determine left side of statement in %r' % stmt)
-
-            v = interpret_expression(expr, local_vars, allow_recursion)
-            return assign(v)
-
-        def interpret_expression(expr, local_vars, allow_recursion):
-            if expr.isdigit():
-                return int(expr)
-
-            if expr.isalpha():
-                return local_vars[expr]
-
-            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
-            if m:
-                member = m.group('member')
-                val = local_vars[m.group('in')]
-                if member == 'split("")':
-                    return list(val)
-                if member == 'join("")':
-                    return u''.join(val)
-                if member == 'length':
-                    return len(val)
-                if member == 'reverse()':
-                    return val[::-1]
-                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
-                if slice_m:
-                    idx = interpret_expression(
-                        slice_m.group('idx'), local_vars, allow_recursion-1)
-                    return val[idx:]
-
-            m = re.match(
-                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
-            if m:
-                val = local_vars[m.group('in')]
-                idx = interpret_expression(m.group('idx'), local_vars,
-                                           allow_recursion-1)
-                return val[idx]
-
-            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
-            if m:
-                a = interpret_expression(m.group('a'),
-                                         local_vars, allow_recursion)
-                b = interpret_expression(m.group('b'),
-                                         local_vars, allow_recursion)
-                return a % b
-
-            m = re.match(
-                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
-            if m:
-                fname = m.group('func')
-                if fname not in functions:
-                    functions[fname] = extract_function(fname)
-                argvals = [int(v) if v.isdigit() else local_vars[v]
-                           for v in m.group('args').split(',')]
-                return functions[fname](argvals)
-            raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
-        def extract_function(funcname):
-            func_m = re.search(
-                r'function ' + re.escape(funcname) +
-                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
-                jscode)
-            argnames = func_m.group('args').split(',')
-
-            def resf(args):
-                local_vars = dict(zip(argnames, args))
-                for stmt in func_m.group('code').split(';'):
-                    res = interpret_statement(stmt, local_vars)
-                return res
-            return resf
-
-        initial_function = extract_function(funcname)
+             u'Initial JS player signature function name')
+
+        jsi = JSInterpreter(jscode)
+        initial_function = jsi.extract_function(funcname)
          return lambda s: initial_function([s])
  
      def _parse_sig_swf(self, file_contents):
          return lambda s: initial_function([s])
  
      def _parse_sig_swf(self, file_contents):
@@ -1184,9 +1082,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                      break
          if 'token' not in video_info:
              if 'reason' in video_info:
                      break
          if 'token' not in video_info:
              if 'reason' in video_info:
-                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
+                raise ExtractorError(
+                    u'YouTube said: %s' % video_info['reason'][0],
+                    expected=True, video_id=video_id)
              else:
              else:
-                raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+                raise ExtractorError(
+                    u'"token" parameter not in video info for unknown reason',
+                    video_id=video_id)
  
          if 'view_count' in video_info:
              view_count = int(video_info['view_count'][0])
  
          if 'view_count' in video_info:
              view_count = int(video_info['view_count'][0])
@@ -1215,7 +1117,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
          # title
          if 'title' in video_info:
  
          # title
          if 'title' in video_info:
-            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
+            video_title = video_info['title'][0]
          else:
              self._downloader.report_warning(u'Unable to extract video title')
              video_title = u'_'
          else:
              self._downloader.report_warning(u'Unable to extract video title')
              video_title = u'_'
@@ -1521,7 +1423,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                  return self.url_result(video_id, 'Youtube', video_id=video_id)
              else:
                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                  return self.url_result(video_id, 'Youtube', video_id=video_id)
              else:
-                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
          if playlist_id.startswith('RD'):
              # Mixes require a custom extraction process
  
          if playlist_id.startswith('RD'):
              # Mixes require a custom extraction process
@@ -1534,6 +1436,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
  
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
  
+        # Check if the playlist exists or is private
+        if re.search(r'<div class="yt-alert-message">[^<]*?The playlist does not exist[^<]*?</div>', page) is not None:
+            raise ExtractorError(
+                u'The playlist doesn\'t exist or is private, use --username or '
+                '--netrc to access it.',
+                expected=True)
+
          # Extract the video ids from the playlist pages
          ids = []
  
          # Extract the video ids from the playlist pages
          ids = []
  
@@ -1549,12 +1458,15 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                  break
  
              more = self._download_json(
                  break
  
              more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
+                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
              content_html = more['content_html']
              more_widget_html = more['load_more_widget_html']
  
          playlist_title = self._html_search_regex(
              content_html = more['content_html']
              more_widget_html = more['load_more_widget_html']
  
          playlist_title = self._html_search_regex(
-                r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
+            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+            page, u'title')
  
          url_results = self._ids_to_results(ids)
          return self.playlist_result(url_results, playlist_id, playlist_title)
  
          url_results = self._ids_to_results(ids)
          return self.playlist_result(url_results, playlist_id, playlist_title)
@@ -1712,7 +1624,7 @@ class YoutubeUserIE(InfoExtractor):
  
  class YoutubeSearchIE(SearchInfoExtractor):
      IE_DESC = u'YouTube.com searches'
  
  class YoutubeSearchIE(SearchInfoExtractor):
      IE_DESC = u'YouTube.com searches'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
+    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
      _MAX_RESULTS = 1000
      IE_NAME = u'youtube:search'
      _SEARCH_KEY = 'ytsearch'
      _MAX_RESULTS = 1000
      IE_NAME = u'youtube:search'
      _SEARCH_KEY = 'ytsearch'
@@ -1723,9 +1635,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
          video_ids = []
          pagenum = 0
          limit = n
          video_ids = []
          pagenum = 0
          limit = n
+        PAGE_SIZE = 50
  
  
-        while (50 * pagenum) < limit:
-            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
+        while (PAGE_SIZE * pagenum) < limit:
+            result_url = self._API_URL % (
+                compat_urllib_parse.quote_plus(query.encode('utf-8')),
+                (PAGE_SIZE * pagenum) + 1)
              data_json = self._download_webpage(
                  result_url, video_id=u'query "%s"' % query,
                  note=u'Downloading page %s' % (pagenum + 1),
              data_json = self._download_webpage(
                  result_url, video_id=u'query "%s"' % query,
                  note=u'Downloading page %s' % (pagenum + 1),
@@ -1836,11 +1751,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          feed_entries = []
          paging = 0
          for i in itertools.count(1):
          feed_entries = []
          paging = 0
          for i in itertools.count(1):
-            info = self._download_webpage(self._FEED_TEMPLATE % paging,
+            info = self._download_json(self._FEED_TEMPLATE % paging,
                                            u'%s feed' % self._FEED_NAME,
                                            u'Downloading page %s' % i)
                                            u'%s feed' % self._FEED_NAME,
                                            u'Downloading page %s' % i)
-            info = json.loads(info)
-            feed_html = info['feed_html']
+            feed_html = info.get('feed_html') or info.get('content_html')
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
              feed_entries.extend(
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
              feed_entries.extend(
@@ -1852,7 +1766,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
  
  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
  
  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
      _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
      _FEED_NAME = 'subscriptions'
      _PLAYLIST_TITLE = u'Youtube Subscriptions'
      _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
      _FEED_NAME = 'subscriptions'
      _PLAYLIST_TITLE = u'Youtube Subscriptions'