Merge branch 'master' into subtitles_rework
author Ismael Mejia <iemejia@gmail.com>
Thu, 22 Aug 2013 23:47:10 +0000 (01:47 +0200)
committer Ismael Mejia <iemejia@gmail.com>
Thu, 22 Aug 2013 23:47:10 +0000 (01:47 +0200)
1  2 
youtube_dl/extractor/youtube.py

index f6ffb86c3f3dcdfc40cbebffa67ae65fff239919,e402ef17f27e967746b007da78c528553c4e8a43..571c73889008349fcbaf157e17d3f6d617ac5a5c
@@@ -7,7 -7,6 +7,7 @@@ import socke
  import itertools
  
  from .common import InfoExtractor, SearchInfoExtractor
 +from .subtitles import SubtitlesIE
  from ..utils import (
      compat_http_client,
      compat_parse_qs,
      orderedSet,
  )
  
 -class YoutubeBaseInfoExtractor(InfoExtractor):
 -    """Provide base functions for Youtube extractors"""
 -    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 -    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 -    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 -    _NETRC_MACHINE = 'youtube'
 -    # If True it will raise an error if no login info is provided
 -    _LOGIN_REQUIRED = False
 -
 -    def report_lang(self):
 -        """Report attempt to set language."""
 -        self.to_screen(u'Setting language')
 -
 -    def _set_language(self):
 -        request = compat_urllib_request.Request(self._LANG_URL)
 -        try:
 -            self.report_lang()
 -            compat_urllib_request.urlopen(request).read()
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 -            return False
 -        return True
 -
 -    def _login(self):
 -        (username, password) = self._get_login_info()
 -        # No authentication to be performed
 -        if username is None:
 -            if self._LOGIN_REQUIRED:
 -                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
 -            return False
 -
 -        request = compat_urllib_request.Request(self._LOGIN_URL)
 -        try:
 -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 -            return False
 -
 -        galx = None
 -        dsh = None
 -        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 -        if match:
 -          galx = match.group(1)
 -        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 -        if match:
 -          dsh = match.group(1)
 -
 -        # Log in
 -        login_form_strs = {
 -                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 -                u'Email': username,
 -                u'GALX': galx,
 -                u'Passwd': password,
 -                u'PersistentCookie': u'yes',
 -                u'_utf8': u'霱',
 -                u'bgresponse': u'js_disabled',
 -                u'checkConnection': u'',
 -                u'checkedDomains': u'youtube',
 -                u'dnConn': u'',
 -                u'dsh': dsh,
 -                u'pstMsg': u'0',
 -                u'rmShown': u'1',
 -                u'secTok': u'',
 -                u'signIn': u'Sign in',
 -                u'timeStmp': u'',
 -                u'service': u'youtube',
 -                u'uilel': u'3',
 -                u'hl': u'en_US',
 -        }
 -        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 -        # chokes on unicode
 -        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 -        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 +
 +class YoutubeSubtitlesIE(SubtitlesIE):
 +
 +    def _get_available_subtitles(self, video_id):
 +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
          try:
 -            self.report_login()
 -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 -                self._downloader.report_warning(u'unable to log in: bad username or password')
 -                return False
 +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 -            return False
 -        return True
 -
 -    def _confirm_age(self):
 -        age_form = {
 -                'next_url':     '/',
 -                'action_confirm':   'Confirm',
 -                }
 -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 +            return {}
 +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 +
 +        sub_lang_list = {}
 +        for l in lang_list:
 +            lang = l[1]
 +            params = compat_urllib_parse.urlencode({
 +                'lang': lang,
 +                'v': video_id,
 +                'fmt': self._downloader.params.get('subtitlesformat'),
 +            })
 +            url = u'http://www.youtube.com/api/timedtext?' + params
 +            sub_lang_list[lang] = url
 +        if not sub_lang_list:
 +            self._downloader.report_warning(u'video doesn\'t have subtitles')
 +            return {}
 +        return sub_lang_list
 +
 +    def _request_automatic_caption(self, video_id, webpage):
 +        """We need the webpage for getting the captions url, pass it as an
 +           argument to speed up the process."""
 +        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
 +        sub_format = self._downloader.params.get('subtitlesformat')
 +        self.to_screen(u'%s: Looking for automatic captions' % video_id)
 +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 +        if mobj is None:
 +            self._downloader.report_warning(err_msg)
 +            return {}
 +        player_config = json.loads(mobj.group(1))
          try:
 -            self.report_age_confirmation()
 -            compat_urllib_request.urlopen(request).read().decode('utf-8')
 -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 -        return True
 +            args = player_config[u'args']
 +            caption_url = args[u'ttsurl']
 +            timestamp = args[u'timestamp']
 +            params = compat_urllib_parse.urlencode({
 +                'lang': 'en',
 +                'tlang': sub_lang,
 +                'fmt': sub_format,
 +                'ts': timestamp,
 +                'kind': 'asr',
 +            })
 +            subtitles_url = caption_url + '&' + params
 +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 +            return {sub_lang: sub}
 +        # An extractor error can be raised by the download process if there are
 +        # no automatic captions but there are subtitles
 +        except (KeyError, ExtractorError):
 +            self._downloader.report_warning(err_msg)
 +            return {}
  
 -    def _real_initialize(self):
 -        if self._downloader is None:
 -            return
 -        if not self._set_language():
 -            return
 -        if not self._login():
 -            return
 -        self._confirm_age()
  
 -class YoutubeIE(YoutubeBaseInfoExtractor):
 +class YoutubeIE(YoutubeSubtitlesIE):
      IE_DESC = u'YouTube.com'
      _VALID_URL = r"""^
                       (
                           (?:                                                  # the various things that can precede the ID:
                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                               |(?:                                             # or the v= param in all its forms
 -                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 +                                 (?:watch|movie(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                   (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                   (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                   v=
          """Report attempt to download video info webpage."""
          self.to_screen(u'%s: Downloading video info webpage' % video_id)
  
 -    def report_video_subtitles_download(self, video_id):
 -        """Report attempt to download video info webpage."""
 -        self.to_screen(u'%s: Checking available subtitles' % video_id)
 -
 -    def report_video_subtitles_request(self, video_id, sub_lang, format):
 -        """Report attempt to download video info webpage."""
 -        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 -
 -    def report_video_subtitles_available(self, video_id, sub_lang_list):
 -        """Report available subtitles."""
 -        sub_lang = ",".join(list(sub_lang_list.keys()))
 -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 -
      def report_information_extraction(self, video_id):
          """Report attempt to extract video information."""
          self.to_screen(u'%s: Extracting video information' % video_id)
          elif len(s) == 83:
              return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
          elif len(s) == 82:
-             return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
+             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
          elif len(s) == 81:
              return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
          elif len(s) == 79:
          # subtitles
          video_subtitles = None
  
 -        if self._downloader.params.get('writesubtitles', False):
 -            video_subtitles = self._extract_subtitle(video_id)
 -            if video_subtitles:
 -                (sub_error, sub_lang, sub) = video_subtitles[0]
 -                if sub_error:
 -                    self._downloader.report_warning(sub_error)
 -        
 -        if self._downloader.params.get('writeautomaticsub', False):
 +        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
 +            video_subtitles = self._extract_subtitles(video_id)
 +        elif self._downloader.params.get('writeautomaticsub', False):
              video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 -            (sub_error, sub_lang, sub) = video_subtitles[0]
 -            if sub_error:
 -                self._downloader.report_warning(sub_error)
 -
 -        if self._downloader.params.get('allsubtitles', False):
 -            video_subtitles = self._extract_all_subtitles(video_id)
 -            for video_subtitle in video_subtitles:
 -                (sub_error, sub_lang, sub) = video_subtitle
 -                if sub_error:
 -                    self._downloader.report_warning(sub_error)
  
          if self._downloader.params.get('listsubtitles', False):
              self._list_available_subtitles(video_id)