Merge branch 'master' into subtitles_rework
author Ismael Mejia <iemejia@gmail.com>
Mon, 26 Aug 2013 02:03:40 +0000 (04:03 +0200)
committer Ismael Mejia <iemejia@gmail.com>
Tue, 27 Aug 2013 22:33:12 +0000 (00:33 +0200)
1  2 
youtube_dl/__init__.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/youtube.py

Simple merge
index f54ecc569cbe02714df09cdaf55c72ffd7129895,fa8c630d053168bf30d835952debd67536555c0c..003b1d8c3e6233368b764e2866431cde13e032f3
@@@ -1,39 -1,17 +1,40 @@@
  import re
  import json
+ import itertools
 +import socket
  
  from .common import InfoExtractor
 +from .subtitles import NoAutoSubtitlesIE
 +
  from ..utils import (
 +    compat_http_client,
 +    compat_urllib_error,
      compat_urllib_request,
 +    compat_str,
      get_element_by_attribute,
      get_element_by_id,
  
      ExtractorError,
  )
  
 -class DailymotionIE(InfoExtractor):
 +
 +class DailyMotionSubtitlesIE(NoAutoSubtitlesIE):
 +
 +    def _get_available_subtitles(self, video_id):
 +        request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id)
 +        try:
 +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 +            return {}
 +        info = json.loads(sub_list)
 +        if (info['total'] > 0):
 +            sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
 +            return sub_lang_list
 +        self._downloader.report_warning(u'video doesn\'t have subtitles')
 +        return {}
 +
- class DailymotionIE(DailyMotionSubtitlesIE):
++class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):
      """Information Extractor for Dailymotion"""
  
      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
index 571c73889008349fcbaf157e17d3f6d617ac5a5c,446d53f644114e40915a821d63b0d61d6ed19d0d..370cc64cc911ff3871604c755ac7c7d13e44ec8c
@@@ -24,67 -23,114 +24,172 @@@ from ..utils import 
      orderedSet,
  )
  
+ class YoutubeBaseInfoExtractor(InfoExtractor):
+     """Provide base functions for Youtube extractors"""
+     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
+     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+     _NETRC_MACHINE = 'youtube'
+     # If True it will raise an error if no login info is provided
+     _LOGIN_REQUIRED = False
+     def report_lang(self):
+         """Report attempt to set language."""
+         self.to_screen(u'Setting language')
+     def _set_language(self):
+         request = compat_urllib_request.Request(self._LANG_URL)
+         try:
+             self.report_lang()
+             compat_urllib_request.urlopen(request).read()
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
+             return False
+         return True
+     def _login(self):
+         (username, password) = self._get_login_info()
+         # No authentication to be performed
+         if username is None:
+             if self._LOGIN_REQUIRED:
+                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+             return False
+         request = compat_urllib_request.Request(self._LOGIN_URL)
+         try:
+             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
+             return False
+         galx = None
+         dsh = None
+         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
+         if match:
+           galx = match.group(1)
+         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
+         if match:
+           dsh = match.group(1)
+         # Log in
+         login_form_strs = {
+                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+                 u'Email': username,
+                 u'GALX': galx,
+                 u'Passwd': password,
+                 u'PersistentCookie': u'yes',
+                 u'_utf8': u'霱',
+                 u'bgresponse': u'js_disabled',
+                 u'checkConnection': u'',
+                 u'checkedDomains': u'youtube',
+                 u'dnConn': u'',
+                 u'dsh': dsh,
+                 u'pstMsg': u'0',
+                 u'rmShown': u'1',
+                 u'secTok': u'',
+                 u'signIn': u'Sign in',
+                 u'timeStmp': u'',
+                 u'service': u'youtube',
+                 u'uilel': u'3',
+                 u'hl': u'en_US',
+         }
+         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+         # chokes on unicode
+         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
+         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+         try:
+             self.report_login()
+             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
+             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+                 self._downloader.report_warning(u'unable to log in: bad username or password')
+                 return False
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+             return False
+         return True
+     def _confirm_age(self):
+         age_form = {
+                 'next_url':     '/',
+                 'action_confirm':   'Confirm',
+                 }
+         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+         try:
+             self.report_age_confirmation()
+             compat_urllib_request.urlopen(request).read().decode('utf-8')
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+         return True
+     def _real_initialize(self):
+         if self._downloader is None:
+             return
+         if not self._set_language():
+             return
+         if not self._login():
+             return
+         self._confirm_age()
  
 -class YoutubeIE(YoutubeBaseInfoExtractor):
 +class YoutubeSubtitlesIE(SubtitlesIE):
 +
 +    def _get_available_subtitles(self, video_id):
 +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 +        try:
 +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 +            return {}
 +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 +
 +        sub_lang_list = {}
 +        for l in lang_list:
 +            lang = l[1]
 +            params = compat_urllib_parse.urlencode({
 +                'lang': lang,
 +                'v': video_id,
 +                'fmt': self._downloader.params.get('subtitlesformat'),
 +            })
 +            url = u'http://www.youtube.com/api/timedtext?' + params
 +            sub_lang_list[lang] = url
 +        if not sub_lang_list:
 +            self._downloader.report_warning(u'video doesn\'t have subtitles')
 +            return {}
 +        return sub_lang_list
 +
 +    def _request_automatic_caption(self, video_id, webpage):
 +        """We need the webpage for getting the captions url, pass it as an
 +           argument to speed up the process."""
 +        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
 +        sub_format = self._downloader.params.get('subtitlesformat')
 +        self.to_screen(u'%s: Looking for automatic captions' % video_id)
 +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 +        if mobj is None:
 +            self._downloader.report_warning(err_msg)
 +            return {}
 +        player_config = json.loads(mobj.group(1))
 +        try:
 +            args = player_config[u'args']
 +            caption_url = args[u'ttsurl']
 +            timestamp = args[u'timestamp']
 +            params = compat_urllib_parse.urlencode({
 +                'lang': 'en',
 +                'tlang': sub_lang,
 +                'fmt': sub_format,
 +                'ts': timestamp,
 +                'kind': 'asr',
 +            })
 +            subtitles_url = caption_url + '&' + params
 +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 +            return {sub_lang: sub}
 +        # An extractor error can be raise by the download process if there are
 +        # no automatic captions but there are subtitles
 +        except (KeyError, ExtractorError):
 +            self._downloader.report_warning(err_msg)
 +            return {}
 +
- class YoutubeIE(YoutubeSubtitlesIE):
++class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com'
      _VALID_URL = r"""^
                       (
              # Fallback to the other algortihms
              return self._decrypt_signature(s)
  
--
--    def _get_available_subtitles(self, video_id):
--        self.report_video_subtitles_download(video_id)
--        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
--        try:
--            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
--        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 -            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 -            return {}
--        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
--        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
--        if not sub_lang_list:
-             return (u'video doesn\'t have subtitles', None)
 -            self._downloader.report_warning(u'video doesn\'t have subtitles')
 -            return {}
--        return sub_lang_list
--
--    def _list_available_subtitles(self, video_id):
--        sub_lang_list = self._get_available_subtitles(video_id)
--        self.report_video_subtitles_available(video_id, sub_lang_list)
--
--    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
--        """
-         Return tuple:
-         (error_message, sub_lang, sub)
 -        Return the subtitle as a string or None if they are not found
--        """
--        self.report_video_subtitles_request(video_id, sub_lang, format)
--        params = compat_urllib_parse.urlencode({
--            'lang': sub_lang,
--            'name': sub_name,
--            'v': video_id,
--            'fmt': format,
--        })
--        url = 'http://www.youtube.com/api/timedtext?' + params
--        try:
--            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
--        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 -            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
 -            return
--        if not sub:
-             return (u'Did not fetch video subtitles', None, None)
-         return (None, sub_lang, sub)
 -            self._downloader.report_warning(u'Did not fetch video subtitles')
 -            return
 -        return sub
--
--    def _request_automatic_caption(self, video_id, webpage):
--        """We need the webpage for getting the captions url, pass it as an
--           argument to speed up the process."""
--        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
--        sub_format = self._downloader.params.get('subtitlesformat')
--        self.to_screen(u'%s: Looking for automatic captions' % video_id)
--        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
--        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
--        if mobj is None:
-             return [(err_msg, None, None)]
 -            self._downloader.report_warning(err_msg)
 -            return {}
--        player_config = json.loads(mobj.group(1))
--        try:
--            args = player_config[u'args']
--            caption_url = args[u'ttsurl']
--            timestamp = args[u'timestamp']
--            params = compat_urllib_parse.urlencode({
--                'lang': 'en',
--                'tlang': sub_lang,
--                'fmt': sub_format,
--                'ts': timestamp,
--                'kind': 'asr',
--            })
--            subtitles_url = caption_url + '&' + params
--            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
-             return [(None, sub_lang, sub)]
-         except KeyError:
-             return [(err_msg, None, None)]
-     def _extract_subtitle(self, video_id):
 -            return {sub_lang: sub}
 -        # An extractor error can be raise by the download process if there are
 -        # no automatic captions but there are subtitles
 -        except (KeyError, ExtractorError):
 -            self._downloader.report_warning(err_msg)
 -            return {}
 -    
 -    def _extract_subtitles(self, video_id):
--        """
-         Return a list with a tuple:
-         [(error_message, sub_lang, sub)]
 -        Return a dictionary: {language: subtitles} or {} if the subtitles
 -        couldn't be found
--        """
--        sub_lang_list = self._get_available_subtitles(video_id)
--        sub_format = self._downloader.params.get('subtitlesformat')
-         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
-             return [(sub_lang_list[0], None, None)]
-         if self._downloader.params.get('subtitleslang', False):
-             sub_lang = self._downloader.params.get('subtitleslang')
-         elif 'en' in sub_lang_list:
-             sub_lang = 'en'
 -        if  not sub_lang_list: #There was some error, it didn't get the available subtitles
 -            return {}
 -        if self._downloader.params.get('allsubtitles', False):
 -            pass
--        else:
-             sub_lang = list(sub_lang_list.keys())[0]
-         if not sub_lang in sub_lang_list:
-             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
-         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
-         return [subtitle]
-     def _extract_all_subtitles(self, video_id):
-         sub_lang_list = self._get_available_subtitles(video_id)
-         sub_format = self._downloader.params.get('subtitlesformat')
-         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
-             return [(sub_lang_list[0], None, None)]
-         subtitles = []
 -            if self._downloader.params.get('subtitleslang', False):
 -                sub_lang = self._downloader.params.get('subtitleslang')
 -            elif 'en' in sub_lang_list:
 -                sub_lang = 'en'
 -            else:
 -                sub_lang = list(sub_lang_list.keys())[0]
 -            if not sub_lang in sub_lang_list:
 -                self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
 -                return {}
 -            sub_lang_list = {sub_lang: sub_lang_list[sub_lang]}
 -        subtitles = {}
--        for sub_lang in sub_lang_list:
--            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
-             subtitles.append(subtitle)
 -            if subtitle:
 -                subtitles[sub_lang] = subtitle
--        return subtitles
--
      def _print_formats(self, formats):
          print('Available formats:')
          for x in formats: