Merge branch 'vimeo' of ssh://github.com/rbrito/youtube-dl into vimeo
author Rogério Brito <rbrito@ime.usp.br>
Tue, 22 Feb 2011 00:52:48 +0000 (21:52 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Tue, 22 Feb 2011 00:52:48 +0000 (21:52 -0300)
youtube-dl

diff --combined youtube-dl
index 5a68a2ee9a169039b4ced962d12b5511af7844d7,0e837868f28fa3179823905a56fd827aebe6f3dd..782372688918ba638206e8a755e849f1ee7527bd
@@@ -6,7 -6,6 +6,7 @@@
  # Author: Vasyl' Vavrychuk
  # Author: Witold Baryluk
  # Author: Paweł Paprota
 +# Author: Gergely Imreh
  # License: Public domain code
  import cookielib
  import ctypes
@@@ -1724,7 -1723,7 +1724,7 @@@ class VimeoIE(InfoExtractor)
        """Information extractor for vimeo.com."""
  
        # _VALID_URL matches Vimeo URLs
-       _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)'
+       _VALID_URL = r'(?:http://)?(?:(?:www|player)\.)?vimeo\.com/(?:video/)?([0-9]+)'
  
        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return
  
-               # Extract uploader and title from webpage
+               # Now we begin extracting as much information as we can from what we
+               # retrieved. First we extract the information common to all extractors,
+               # and later we extract those that are Vimeo specific.
                self.report_extraction(video_id)
+               # Extract title
                mobj = re.search(r'<caption>(.*?)</caption>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                video_title = mobj.group(1).decode('utf-8')
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
  
+               # Extract uploader
                mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
                # if not video_description: video_description = 'No description available.'
                video_description = 'Foo.'
  
-               # Extract request signature
+               # Vimeo specific: extract request signature
                mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract request signature')
                        return
                sig = mobj.group(1).decode('utf-8')
  
-               # Extract request signature expiration
+               # Vimeo specific: Extract request signature expiration
                mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
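
The broadened _VALID_URL above now accepts the www. and player. hosts as well as /video/ paths. A minimal sketch of how the pattern could be exercised (the sample URLs and ids are illustrative only):

    import re

    _VALID_URL = r'(?:http://)?(?:(?:www|player)\.)?vimeo\.com/(?:video/)?([0-9]+)'

    for url in ('http://vimeo.com/12345',
                'http://www.vimeo.com/12345',
                'http://player.vimeo.com/video/12345'):
        mobj = re.match(_VALID_URL, url)
        print url, '->', mobj.group(1) if mobj else None
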
@@@ -2452,229 -2456,6 +2457,229 @@@ class DepositFilesIE(InfoExtractor)
                except UnavailableVideoError, err:
                        self._downloader.trouble(u'ERROR: unable to download file')
  
 +class FacebookIE(InfoExtractor):
 +      """Information Extractor for Facebook"""
 +
 +      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
 +      _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
 +      _NETRC_MACHINE = 'facebook'
 +      _available_formats = ['highqual', 'lowqual']
 +      _video_extensions = {
 +              'highqual': 'mp4',
 +              'lowqual': 'mp4',
 +      }
 +
 +      def __init__(self, downloader=None):
 +              InfoExtractor.__init__(self, downloader)
 +
 +      @staticmethod
 +      def suitable(url):
 +              return (re.match(FacebookIE._VALID_URL, url) is not None)
 +
 +      def _reporter(self, message):
 +              """Add header and report message."""
 +              self._downloader.to_screen(u'[facebook] %s' % message)
 +
 +      def report_login(self):
 +              """Report attempt to log in."""
 +              self._reporter(u'Logging in')
 +
 +      def report_video_webpage_download(self, video_id):
 +              """Report attempt to download video webpage."""
 +              self._reporter(u'%s: Downloading video webpage' % video_id)
 +
 +      def report_information_extraction(self, video_id):
 +              """Report attempt to extract video information."""
 +              self._reporter(u'%s: Extracting video information' % video_id)
 +
 +      def _parse_page(self, video_webpage):
 +              """Extract video information from page"""
 +              # General data
 +              data = {'title': r'class="video_title datawrap">(.*?)</',
 +                      'description': r'<div class="datawrap">(.*?)</div>',
 +                      'owner': r'\("video_owner_name", "(.*?)"\)',
 +                      'upload_date': r'data-date="(.*?)"',
 +                      'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
 +                      }
 +              video_info = {}
 +              for piece in data.keys():
 +                      mobj = re.search(data[piece], video_webpage)
 +                      if mobj is not None:
 +                              video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
 +
 +              # Video urls
 +              video_urls = {}
 +              for fmt in self._available_formats:
 +                      mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
 +                      if mobj is not None:
 +                              # The URL lives in a JavaScript fragment and is stored as an escaped
 +                              # Unicode string inside the otherwise UTF-8 page
 +                              video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
 +              video_info['video_urls'] = video_urls
 +
 +              return video_info
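
_parse_page above is just a table of regular expressions run over the raw page, with every hit URL-unquoted and unicode-unescaped. A minimal sketch of the same idea against an invented page fragment (the HTML and values are made up for illustration):

    import re
    import urllib

    page = '("video_owner_name", "Some\\u0020User")<div class="datawrap">A clip</div>'
    patterns = {'owner': r'\("video_owner_name", "(.*?)"\)',
                'description': r'<div class="datawrap">(.*?)</div>'}

    info = {}
    for name in patterns:
        mobj = re.search(patterns[name], page)
        if mobj is not None:
            # same unquote + unicode_escape treatment as _parse_page
            info[name] = urllib.unquote_plus(mobj.group(1).decode('unicode_escape'))
    print info   # e.g. {'owner': u'Some User', 'description': u'A clip'}
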
 +
 +      def _real_initialize(self):
 +              if self._downloader is None:
 +                      return
 +
 +              useremail = None
 +              password = None
 +              downloader_params = self._downloader.params
 +
 +              # Attempt to use provided username and password or .netrc data
 +              if downloader_params.get('username', None) is not None:
 +                      useremail = downloader_params['username']
 +                      password = downloader_params['password']
 +              elif downloader_params.get('usenetrc', False):
 +                      try:
 +                              info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 +                              if info is not None:
 +                                      useremail = info[0]
 +                                      password = info[2]
 +                              else:
 +                                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 +                      except (IOError, netrc.NetrcParseError), err:
 +                              self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 +                              return
 +
 +              if useremail is None:
 +                      return
 +
 +              # Log in
 +              login_form = {
 +                      'email': useremail,
 +                      'pass': password,
 +                      'login': 'Log+In'
 +                      }
 +              request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 +              try:
 +                      self.report_login()
 +                      login_results = urllib2.urlopen(request).read()
 +                      if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 +                              self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
 +                              return
 +              except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 +                      self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 +                      return
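
The login above only runs when credentials are available; when the usenetrc option is set, the code falls back to the standard netrc module. A hedged sketch of that path (the machine name matches _NETRC_MACHINE, the credentials below are placeholders):

    # ~/.netrc entry the extractor would pick up:
    #     machine facebook login me@example.com password hunter2

    import netrc

    # netrc.netrc() raises IOError if ~/.netrc does not exist
    info = netrc.netrc().authenticators('facebook')
    if info is not None:
        useremail, _account, password = info   # (login, account, password)
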
 +
 +      def _real_extract(self, url):
 +              mobj = re.match(self._VALID_URL, url)
 +              if mobj is None:
 +                      self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 +                      return
 +              video_id = mobj.group('ID')
 +
 +              # Get video webpage
 +              self.report_video_webpage_download(video_id)
 +              request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
 +              try:
 +                      page = urllib2.urlopen(request)
 +                      video_webpage = page.read()
 +              except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 +                      self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 +                      return
 +
 +              # Start extracting information
 +              self.report_information_extraction(video_id)
 +
 +              # Extract information
 +              video_info = self._parse_page(video_webpage)
 +
 +              # uploader
 +              if 'owner' not in video_info:
 +                      self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 +                      return
 +              video_uploader = video_info['owner']
 +
 +              # title
 +              if 'title' not in video_info:
 +                      self._downloader.trouble(u'ERROR: unable to extract video title')
 +                      return
 +              video_title = video_info['title']
 +              video_title = video_title.decode('utf-8')
 +              video_title = sanitize_title(video_title)
 +
 +              # simplified title
 +              simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 +              simple_title = simple_title.strip(ur'_')
 +
 +              # thumbnail image
 +              if 'thumbnail' not in video_info:
 +                      self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 +                      video_thumbnail = ''
 +              else:
 +                      video_thumbnail = video_info['thumbnail']
 +
 +              # upload date
 +              upload_date = u'NA'
 +              if 'upload_date' in video_info:
 +                      upload_time = video_info['upload_date']
 +                      timetuple = email.utils.parsedate_tz(upload_time)
 +                      if timetuple is not None:
 +                              try:
 +                                      upload_date = time.strftime('%Y%m%d', timetuple[0:9])
 +                              except:
 +                                      pass
 +
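
The upload date handling above leans on email.utils.parsedate_tz. A small worked example, assuming the page's data-date value is an RFC 2822 style date (the sample string is invented):

    import email.utils
    import time

    timetuple = email.utils.parsedate_tz('Thu, 17 Feb 2011 10:00:00 -0800')
    if timetuple is not None:
        # only the year/month/day fields matter for %Y%m%d
        print time.strftime('%Y%m%d', timetuple[0:9])   # 20110217
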
 +              # description
 +              video_description = 'No description available.'
 +              if (self._downloader.params.get('forcedescription', False) and
 +                  'description' in video_info):
 +                      video_description = video_info['description']
 +
 +              url_map = video_info['video_urls']
 +              if len(url_map.keys()) > 0:
 +                      # Decide which formats to download
 +                      req_format = self._downloader.params.get('format', None)
 +                      format_limit = self._downloader.params.get('format_limit', None)
 +
 +                      if format_limit is not None and format_limit in self._available_formats:
 +                              format_list = self._available_formats[self._available_formats.index(format_limit):]
 +                      else:
 +                              format_list = self._available_formats
 +                      existing_formats = [x for x in format_list if x in url_map]
 +                      if len(existing_formats) == 0:
 +                              self._downloader.trouble(u'ERROR: no known formats available for video')
 +                              return
 +                      if req_format is None:
 +                              video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 +                      elif req_format == '-1':
 +                              video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 +                      else:
 +                              # Specific format
 +                              if req_format not in url_map:
 +                                      self._downloader.trouble(u'ERROR: requested format not available')
 +                                      return
 +                              video_url_list = [(req_format, url_map[req_format])] # Specific format
 +              else:
 +                      # Without this branch video_url_list would be unbound below
 +                      self._downloader.trouble(u'ERROR: no video URLs found')
 +                      return
 +
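
The selection above mirrors the other extractors: format_limit trims the preference list, then req_format picks from whatever survived. A condensed sketch with the two Facebook formats (URLs are illustrative):

    _available_formats = ['highqual', 'lowqual']
    url_map = {'highqual': 'http://example.invalid/hi.mp4',
               'lowqual': 'http://example.invalid/lo.mp4'}

    format_limit = 'lowqual'   # e.g. from the format_limit downloader parameter
    format_list = _available_formats[_available_formats.index(format_limit):]
    existing_formats = [x for x in format_list if x in url_map]
    # req_format of None means best quality, i.e. the first surviving entry
    video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]
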
 +              for format_param, video_real_url in video_url_list:
 +
 +                      # At this point we have a new video
 +                      self._downloader.increment_downloads()
 +
 +                      # Extension
 +                      video_extension = self._video_extensions.get(format_param, 'mp4')
 +
 +                      # Hand the selected URL and metadata to the downloader
 +                      try:
 +                              # Process video information
 +                              self._downloader.process_info({
 +                                      'id':           video_id.decode('utf-8'),
 +                                      'url':          video_real_url.decode('utf-8'),
 +                                      'uploader':     video_uploader.decode('utf-8'),
 +                                      'upload_date':  upload_date,
 +                                      'title':        video_title,
 +                                      'stitle':       simple_title,
 +                                      'ext':          video_extension.decode('utf-8'),
 +                                      'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 +                                      'thumbnail':    video_thumbnail.decode('utf-8'),
 +                                      'description':  video_description.decode('utf-8'),
 +                                      'player_url':   None,
 +                              })
 +                      except UnavailableVideoError, err:
 +                              self._downloader.trouble(u'\nERROR: unable to download video')
 +
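
suitable() is what the downloader uses to route a URL to this extractor. A quick sketch of the kind of URL the pattern above is written for (the video id is made up):

    print FacebookIE.suitable('http://www.facebook.com/video/video.php?v=123456789')   # True
    print FacebookIE.suitable('http://vimeo.com/12345')                                # False
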
  class PostProcessor(object):
        """Post Processor class.
  
@@@ -2931,7 -2712,6 +2936,7 @@@ if __name__ == '__main__'
                yahoo_ie = YahooIE()
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
                deposit_files_ie = DepositFilesIE()
 +              facebook_ie = FacebookIE()
                generic_ie = GenericIE()
  
                # File downloader
                fd.add_info_extractor(yahoo_ie)
                fd.add_info_extractor(yahoo_search_ie)
                fd.add_info_extractor(deposit_files_ie)
 +              fd.add_info_extractor(facebook_ie)
  
                # This must come last since it's the
                # fallback if none of the others work
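
Registration order matters: extractors are tried in the order they were added, with GenericIE registered last as the catch-all. A minimal usage sketch once everything is wired up (the URL is illustrative):

    # after all add_info_extractor() calls, as in the script's __main__ block
    retcode = fd.download(['http://www.facebook.com/video/video.php?v=123456789'])
    sys.exit(retcode)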