Merge branch 'master' into subtitles_rework
author Ismael Mejia <iemejia@gmail.com>
Mon, 26 Aug 2013 02:03:40 +0000 (04:03 +0200)
committer Ismael Mejia <iemejia@gmail.com>
Tue, 27 Aug 2013 22:33:12 +0000 (00:33 +0200)
youtube_dl/__init__.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/youtube.py

diff --combined youtube_dl/__init__.py
index c21bf6d4afb41eeba91f2d82c556a31ccf6b0601,441ca6b6a74ed4cc2882c4c84fbdf457cd2b7da2..5d686a928e1660870669243854147f0d0bd3b857
@@@ -190,10 -190,13 +190,10 @@@ def parseOpts(overrideArguments=None)
  
      subtitles.add_option('--write-sub', '--write-srt',
              action='store_true', dest='writesubtitles',
 -            help='write subtitle file (currently youtube only)', default=False)
 +            help='write subtitle file', default=False)
      subtitles.add_option('--write-auto-sub', '--write-automatic-sub',
              action='store_true', dest='writeautomaticsub',
 -            help='write automatic subtitle file (currently youtube only)', default=False)
 -    subtitles.add_option('--only-sub',
 -            action='store_true', dest='skip_download',
 -            help='[deprecated] alias of --skip-download', default=False)
 +            help='write automatic subtitle file (youtube only)', default=False)
      subtitles.add_option('--all-subs',
              action='store_true', dest='allsubtitles',
              help='downloads all the available subtitles of the video', default=False)
              help='keeps the video file on disk after the post-processing; the video is erased by default')
      postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False,
              help='do not overwrite post-processed files; the post-processed files are overwritten by default')
+     postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
+             help='embed subtitles in the video (only for mp4 videos)')
  
  
      parser.add_option_group(general)
@@@ -608,6 -613,8 +610,8 @@@ def _real_main(argv=None)
          ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
      if opts.recodevideo:
          ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
+     if opts.embedsubtitles:
+         ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
  
      # Update version
      if opts.update_self:
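
The new --embed-subs flag above simply registers FFmpegEmbedSubtitlePP after the other FFmpeg post-processors, reusing whatever format --sub-format selected. A minimal sketch of the same wiring through the Python API (the module paths, the params dict and the test URL are illustrative assumptions, not taken from this diff):

    # Roughly what `youtube-dl --write-sub --sub-format srt --embed-subs URL` sets up.
    from youtube_dl.YoutubeDL import YoutubeDL
    from youtube_dl.PostProcessor import FFmpegEmbedSubtitlePP

    ydl = YoutubeDL({'writesubtitles': True, 'subtitlesformat': 'srt'})
    ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat='srt'))
    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
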
diff --combined youtube_dl/extractor/dailymotion.py
index f54ecc569cbe02714df09cdaf55c72ffd7129895,fa8c630d053168bf30d835952debd67536555c0c..003b1d8c3e6233368b764e2866431cde13e032f3
@@@ -1,39 -1,17 +1,40 @@@
  import re
  import json
+ import itertools
 +import socket
  
  from .common import InfoExtractor
 +from .subtitles import NoAutoSubtitlesIE
 +
  from ..utils import (
 +    compat_http_client,
 +    compat_urllib_error,
      compat_urllib_request,
 +    compat_str,
      get_element_by_attribute,
      get_element_by_id,
  
      ExtractorError,
  )
  
 -class DailymotionIE(InfoExtractor):
 +
 +class DailyMotionSubtitlesIE(NoAutoSubtitlesIE):
 +
 +    def _get_available_subtitles(self, video_id):
 +        request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id)
 +        try:
 +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 +            return {}
 +        info = json.loads(sub_list)
 +        if (info['total'] > 0):
 +            sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
 +            return sub_lang_list
 +        self._downloader.report_warning(u'video doesn\'t have subtitles')
 +        return {}
 +
- class DailymotionIE(DailyMotionSubtitlesIE):
++class DailymotionIE(DailyMotionSubtitlesIE, InfoExtractor):
      """Information Extractor for Dailymotion"""
  
      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
@@@ -43,7 -21,7 +44,7 @@@
          u'file': u'x33vw9.mp4',
          u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
          u'info_dict': {
-             u"uploader": u"Alex and Van .",
+             u"uploader": u"Alex and Van .", 
              u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
          }
      }
          for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
                      'stream_h264_hq_url','stream_h264_url',
                      'stream_h264_ld_url']:
-             if info.get(key):  # key in info and info[key]:
+             if info.get(key):#key in info and info[key]:
                  max_quality = key
-                 self.to_screen(u'%s: Using %s' % (video_id, key))
+                 self.to_screen(u'Using %s' % key)
                  break
          else:
              raise ExtractorError(u'Unable to extract video URL')
          video_url = info[max_quality]
  
 +        # subtitles
 +        video_subtitles = None
 +        video_webpage = None
 +
 +        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
 +            video_subtitles = self._extract_subtitles(video_id)
 +        elif self._downloader.params.get('writeautomaticsub', False):
 +            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 +
 +        if self._downloader.params.get('listsubtitles', False):
 +            self._list_available_subtitles(video_id)
 +            return
 +
          return [{
              'id':       video_id,
              'url':      video_url,
              'upload_date':  video_upload_date,
              'title':    self._og_search_title(webpage),
              'ext':      video_extension,
 +            'subtitles':    video_subtitles,
              'thumbnail': info['thumbnail_url']
          }]
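
The Dailymotion subtitle lookup above relies on the api.dailymotion.com .../subtitles endpoint returning JSON with 'total' and 'list' keys, which it flattens into a {language: url} mapping. A small illustration with made-up values (only the keys used by _get_available_subtitles matter; the URLs are placeholders):

    info = {
        'total': 2,
        'list': [
            {'id': 'sub-en', 'language': 'en', 'url': 'https://example.com/x33vw9.en.srt'},
            {'id': 'sub-fr', 'language': 'fr', 'url': 'https://example.com/x33vw9.fr.srt'},
        ],
    }
    sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
    # -> {'en': 'https://example.com/x33vw9.en.srt', 'fr': 'https://example.com/x33vw9.fr.srt'}
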
  
diff --combined youtube_dl/extractor/youtube.py
index 571c73889008349fcbaf157e17d3f6d617ac5a5c,446d53f644114e40915a821d63b0d61d6ed19d0d..370cc64cc911ff3871604c755ac7c7d13e44ec8c
@@@ -7,7 -7,6 +7,7 @@@ import socket
  import itertools
  
  from .common import InfoExtractor, SearchInfoExtractor
 +from .subtitles import SubtitlesIE
  from ..utils import (
      compat_http_client,
      compat_parse_qs,
      orderedSet,
  )
  
+ class YoutubeBaseInfoExtractor(InfoExtractor):
+     """Provide base functions for Youtube extractors"""
+     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
+     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+     _NETRC_MACHINE = 'youtube'
+     # If True it will raise an error if no login info is provided
+     _LOGIN_REQUIRED = False
+ 
+     def report_lang(self):
+         """Report attempt to set language."""
+         self.to_screen(u'Setting language')
+ 
+     def _set_language(self):
+         request = compat_urllib_request.Request(self._LANG_URL)
+         try:
+             self.report_lang()
+             compat_urllib_request.urlopen(request).read()
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
+             return False
+         return True
+ 
+     def _login(self):
+         (username, password) = self._get_login_info()
+         # No authentication to be performed
+         if username is None:
+             if self._LOGIN_REQUIRED:
+                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+             return False
+         request = compat_urllib_request.Request(self._LOGIN_URL)
+         try:
+             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
+             return False
+         galx = None
+         dsh = None
+         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
+         if match:
+             galx = match.group(1)
+         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
+         if match:
+             dsh = match.group(1)
+         # Log in
+         login_form_strs = {
+                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+                 u'Email': username,
+                 u'GALX': galx,
+                 u'Passwd': password,
+                 u'PersistentCookie': u'yes',
+                 u'_utf8': u'霱',
+                 u'bgresponse': u'js_disabled',
+                 u'checkConnection': u'',
+                 u'checkedDomains': u'youtube',
+                 u'dnConn': u'',
+                 u'dsh': dsh,
+                 u'pstMsg': u'0',
+                 u'rmShown': u'1',
+                 u'secTok': u'',
+                 u'signIn': u'Sign in',
+                 u'timeStmp': u'',
+                 u'service': u'youtube',
+                 u'uilel': u'3',
+                 u'hl': u'en_US',
+         }
+         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+         # chokes on unicode
+         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
+         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+         try:
+             self.report_login()
+             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
+             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+                 self._downloader.report_warning(u'unable to log in: bad username or password')
+                 return False
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+             return False
+         return True
+ 
+     def _confirm_age(self):
+         age_form = {
+                 'next_url':     '/',
+                 'action_confirm':   'Confirm',
+                 }
+         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+         try:
+             self.report_age_confirmation()
+             compat_urllib_request.urlopen(request).read().decode('utf-8')
+         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+         return True
+ 
+     def _real_initialize(self):
+         if self._downloader is None:
+             return
+         if not self._set_language():
+             return
+         if not self._login():
+             return
+         self._confirm_age()
  
 -class YoutubeIE(YoutubeBaseInfoExtractor):
 +class YoutubeSubtitlesIE(SubtitlesIE):
 +
 +    def _get_available_subtitles(self, video_id):
 +        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 +        try:
 +            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 +            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 +            return {}
 +        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 +
 +        sub_lang_list = {}
 +        for l in lang_list:
 +            lang = l[1]
 +            params = compat_urllib_parse.urlencode({
 +                'lang': lang,
 +                'v': video_id,
 +                'fmt': self._downloader.params.get('subtitlesformat'),
 +            })
 +            url = u'http://www.youtube.com/api/timedtext?' + params
 +            sub_lang_list[lang] = url
 +        if not sub_lang_list:
 +            self._downloader.report_warning(u'video doesn\'t have subtitles')
 +            return {}
 +        return sub_lang_list
 +
 +    def _request_automatic_caption(self, video_id, webpage):
 +        """We need the webpage for getting the captions url, pass it as an
 +           argument to speed up the process."""
 +        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
 +        sub_format = self._downloader.params.get('subtitlesformat')
 +        self.to_screen(u'%s: Looking for automatic captions' % video_id)
 +        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 +        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 +        if mobj is None:
 +            self._downloader.report_warning(err_msg)
 +            return {}
 +        player_config = json.loads(mobj.group(1))
 +        try:
 +            args = player_config[u'args']
 +            caption_url = args[u'ttsurl']
 +            timestamp = args[u'timestamp']
 +            params = compat_urllib_parse.urlencode({
 +                'lang': 'en',
 +                'tlang': sub_lang,
 +                'fmt': sub_format,
 +                'ts': timestamp,
 +                'kind': 'asr',
 +            })
 +            subtitles_url = caption_url + '&' + params
 +            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 +            return {sub_lang: sub}
 +        # An extractor error can be raised by the download process if there are
 +        # no automatic captions but there are subtitles
 +        except (KeyError, ExtractorError):
 +            self._downloader.report_warning(err_msg)
 +            return {}
 +
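
For context, the YoutubeSubtitlesIE helpers above first hit the timedtext 'type=list' endpoint to discover the available tracks, then build one api/timedtext URL per language; the per-extractor subtitle helpers removed further down become redundant. A standalone sketch of that flow, assuming the endpoints behave as the code above expects (the video id and 'srt' format are just examples, error handling omitted):

    import re
    from youtube_dl.utils import compat_urllib_parse, compat_urllib_request

    video_id = 'BaW_jenozKc'
    listing = compat_urllib_request.urlopen(
        'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id).read().decode('utf-8')
    # Each <track> element in the listing carries a display name and a lang_code attribute.
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', listing)
    sub_lang_list = dict(
        (lang, 'http://www.youtube.com/api/timedtext?' + compat_urllib_parse.urlencode(
            {'lang': lang, 'v': video_id, 'fmt': 'srt'}))
        for _name, lang in tracks)

The extractors themselves only decide when to call these helpers, driven by the writesubtitles, writeautomaticsub, allsubtitles and listsubtitles options, as the Dailymotion changes above already show.
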
- class YoutubeIE(YoutubeSubtitlesIE):
++class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com'
      _VALID_URL = r"""^
                       (
                           (?:                                                  # the various things that can precede the ID:
                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                               |(?:                                             # or the v= param in all its forms
-                                  (?:watch|movie(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                   (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                   (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                   v=
          """Report attempt to download video info webpage."""
          self.to_screen(u'%s: Downloading video info webpage' % video_id)
  
 -    def report_video_subtitles_download(self, video_id):
 -        """Report attempt to download video info webpage."""
 -        self.to_screen(u'%s: Checking available subtitles' % video_id)
 -
 -    def report_video_subtitles_request(self, video_id, sub_lang, format):
 -        """Report attempt to download video info webpage."""
 -        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 -
 -    def report_video_subtitles_available(self, video_id, sub_lang_list):
 -        """Report available subtitles."""
 -        sub_lang = ",".join(list(sub_lang_list.keys()))
 -        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 -
      def report_information_extraction(self, video_id):
          """Report attempt to extract video information."""
          self.to_screen(u'%s: Extracting video information' % video_id)
              return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
          elif len(s) == 81:
              return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
+         elif len(s) == 80:
+             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
          elif len(s) == 79:
              return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
  
              # Fallback to the other algorithms
              return self._decrypt_signature(s)
  
--
--    def _get_available_subtitles(self, video_id):
--        self.report_video_subtitles_download(video_id)
--        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
--        try:
--            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
--        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 -            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
 -            return {}
--        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
--        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
--        if not sub_lang_list:
-             return (u'video doesn\'t have subtitles', None)
 -            self._downloader.report_warning(u'video doesn\'t have subtitles')
 -            return {}
--        return sub_lang_list
--
--    def _list_available_subtitles(self, video_id):
--        sub_lang_list = self._get_available_subtitles(video_id)
--        self.report_video_subtitles_available(video_id, sub_lang_list)
--
--    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
--        """
-         Return tuple:
-         (error_message, sub_lang, sub)
 -        Return the subtitle as a string or None if they are not found
--        """
--        self.report_video_subtitles_request(video_id, sub_lang, format)
--        params = compat_urllib_parse.urlencode({
--            'lang': sub_lang,
--            'name': sub_name,
--            'v': video_id,
--            'fmt': format,
--        })
--        url = 'http://www.youtube.com/api/timedtext?' + params
--        try:
--            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
--        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 -            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
 -            return
--        if not sub:
-             return (u'Did not fetch video subtitles', None, None)
-         return (None, sub_lang, sub)
 -            self._downloader.report_warning(u'Did not fetch video subtitles')
 -            return
 -        return sub
--
--    def _request_automatic_caption(self, video_id, webpage):
--        """We need the webpage for getting the captions url, pass it as an
--           argument to speed up the process."""
--        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
--        sub_format = self._downloader.params.get('subtitlesformat')
--        self.to_screen(u'%s: Looking for automatic captions' % video_id)
--        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
--        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
--        if mobj is None:
-             return [(err_msg, None, None)]
 -            self._downloader.report_warning(err_msg)
 -            return {}
--        player_config = json.loads(mobj.group(1))
--        try:
--            args = player_config[u'args']
--            caption_url = args[u'ttsurl']
--            timestamp = args[u'timestamp']
--            params = compat_urllib_parse.urlencode({
--                'lang': 'en',
--                'tlang': sub_lang,
--                'fmt': sub_format,
--                'ts': timestamp,
--                'kind': 'asr',
--            })
--            subtitles_url = caption_url + '&' + params
--            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
-             return [(None, sub_lang, sub)]
-         except KeyError:
-             return [(err_msg, None, None)]
-     def _extract_subtitle(self, video_id):
 -            return {sub_lang: sub}
 -        # An extractor error can be raise by the download process if there are
 -        # no automatic captions but there are subtitles
 -        except (KeyError, ExtractorError):
 -            self._downloader.report_warning(err_msg)
 -            return {}
 -    
 -    def _extract_subtitles(self, video_id):
--        """
-         Return a list with a tuple:
-         [(error_message, sub_lang, sub)]
 -        Return a dictionary: {language: subtitles} or {} if the subtitles
 -        couldn't be found
--        """
--        sub_lang_list = self._get_available_subtitles(video_id)
--        sub_format = self._downloader.params.get('subtitlesformat')
-         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
-             return [(sub_lang_list[0], None, None)]
-         if self._downloader.params.get('subtitleslang', False):
-             sub_lang = self._downloader.params.get('subtitleslang')
-         elif 'en' in sub_lang_list:
-             sub_lang = 'en'
 -        if  not sub_lang_list: #There was some error, it didn't get the available subtitles
 -            return {}
 -        if self._downloader.params.get('allsubtitles', False):
 -            pass
--        else:
-             sub_lang = list(sub_lang_list.keys())[0]
-         if not sub_lang in sub_lang_list:
-             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
-         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
-         return [subtitle]
-     def _extract_all_subtitles(self, video_id):
-         sub_lang_list = self._get_available_subtitles(video_id)
-         sub_format = self._downloader.params.get('subtitlesformat')
-         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
-             return [(sub_lang_list[0], None, None)]
-         subtitles = []
 -            if self._downloader.params.get('subtitleslang', False):
 -                sub_lang = self._downloader.params.get('subtitleslang')
 -            elif 'en' in sub_lang_list:
 -                sub_lang = 'en'
 -            else:
 -                sub_lang = list(sub_lang_list.keys())[0]
 -            if not sub_lang in sub_lang_list:
 -                self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
 -                return {}
 -            sub_lang_list = {sub_lang: sub_lang_list[sub_lang]}
 -        subtitles = {}
--        for sub_lang in sub_lang_list:
--            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
-             subtitles.append(subtitle)
 -            if subtitle:
 -                subtitles[sub_lang] = subtitle
--        return subtitles
--
      def _print_formats(self, formats):
          print('Available formats:')
          for x in formats: