Merge branch 'vimeo' of ssh://github.com/rbrito/youtube-dl into vimeo
author Rogério Brito <rbrito@ime.usp.br>
Tue, 22 Feb 2011 00:52:48 +0000 (21:52 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Tue, 22 Feb 2011 00:52:48 +0000 (21:52 -0300)
youtube-dl

diff --combined youtube-dl
index 5a68a2ee9a169039b4ced962d12b5511af7844d7,0e837868f28fa3179823905a56fd827aebe6f3dd..782372688918ba638206e8a755e849f1ee7527bd
@@@ -6,7 -6,6 +6,7 @@@
  # Author: Vasyl' Vavrychuk
  # Author: Witold Baryluk
  # Author: Paweł Paprota
 +# Author: Gergely Imreh
  # License: Public domain code
  import cookielib
  import ctypes
@@@ -1724,7 -1723,7 +1724,7 @@@ class VimeoIE(InfoExtractor)
        """Information extractor for vimeo.com."""
  
        # _VALID_URL matches Vimeo URLs
-       _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)'
+       _VALID_URL = r'(?:http://)?(?:(?:www|player)\.)?vimeo\.com/(?:video/)?([0-9]+)'
  
        def __init__(self, downloader=None):
                InfoExtractor.__init__(self, downloader)
                        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                        return
  
-               # Extract uploader and title from webpage
+               # Now we begin extracting as much information as we can from what we
+               # retrieved. First we extract the information common to all extractors,
+               # and later we extract those that are Vimeo specific.
                self.report_extraction(video_id)
+               # Extract title
                mobj = re.search(r'<caption>(.*?)</caption>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                video_title = mobj.group(1).decode('utf-8')
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
  
+               # Extract uploader
                mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract video uploader')
                # if not video_description: video_description = 'No description available.'
                video_description = 'Foo.'
  
-               # Extract request signature
+               # Vimeo specific: extract request signature
                mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract request signature')
                        return
                sig = mobj.group(1).decode('utf-8')
  
-               # Extract request signature expiration
+               # Vimeo specific: Extract request signature expiration
                mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
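
The broadened _VALID_URL above now accepts the www. and player. hosts as well as /video/ paths. A minimal sketch of how the pattern could be exercised (the sample URLs and ids are illustrative only):

    import re

    _VALID_URL = r'(?:http://)?(?:(?:www|player)\.)?vimeo\.com/(?:video/)?([0-9]+)'

    for url in ('http://vimeo.com/12345',
                'http://www.vimeo.com/12345',
                'http://player.vimeo.com/video/12345'):
        mobj = re.match(_VALID_URL, url)
        print url, '->', mobj.group(1) if mobj else None
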
@@@ -2452,229 -2456,6 +2457,229 @@@ class DepositFilesIE(InfoExtractor)
                except UnavailableVideoError, err:
                        self._downloader.trouble(u'ERROR: unable to download file')
  
 +class FacebookIE(InfoExtractor):
 +      """Information Extractor for Facebook"""
 +
 +      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
 +      _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
 +      _NETRC_MACHINE = 'facebook'
 +      _available_formats = ['highqual', 'lowqual']
 +      _video_extensions = {
 +              'highqual': 'mp4',
 +              'lowqual': 'mp4',
 +      }
 +
 +      def __init__(self, downloader=None):
 +              InfoExtractor.__init__(self, downloader)
 +
 +      @staticmethod
 +      def suitable(url):
 +              return (re.match(FacebookIE._VALID_URL, url) is not None)
 +
 +      def _reporter(self, message):
 +              """Add header and report message."""
 +              self._downloader.to_screen(u'[facebook] %s' % message)
 +
 +      def report_login(self):
 +              """Report attempt to log in."""
 +              self._reporter(u'Logging in')
 +
 +      def report_video_webpage_download(self, video_id):
 +              """Report attempt to download video webpage."""
 +              self._reporter(u'%s: Downloading video webpage' % video_id)
 +
 +      def report_information_extraction(self, video_id):
 +              """Report attempt to extract video information."""
 +              self._reporter(u'%s: Extracting video information' % video_id)
 +
 +      def _parse_page(self, video_webpage):
 +              """Extract video information from page"""
 +              # General data
 +              data = {'title': r'class="video_title datawrap">(.*?)</',
 +                      'description': r'<div class="datawrap">(.*?)</div>',
 +                      'owner': r'\("video_owner_name", "(.*?)"\)',
 +                      'upload_date': r'data-date="(.*?)"',
 +                      'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
 +                      }
 +              video_info = {}
 +              for piece in data.keys():
 +                      mobj = re.search(data[piece], video_webpage)
 +                      if mobj is not None:
 +                              video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
 +
 +              # Video urls
 +              video_urls = {}
 +              for fmt in self._available_formats:
 +                      mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
 +                      if mobj is not None:
 +                              # The URL lives in a JavaScript fragment and is stored as an escaped
 +                              # Unicode string inside the otherwise UTF-8 page
 +                              video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
 +              video_info['video_urls'] = video_urls
 +
 +              return video_info
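
_parse_page above is just a table of regular expressions run over the raw page, with every hit URL-unquoted and unicode-unescaped. A minimal sketch of the same idea against an invented page fragment (the HTML and values are made up for illustration):

    import re
    import urllib

    page = '("video_owner_name", "Some\\u0020User")<div class="datawrap">A clip</div>'
    patterns = {'owner': r'\("video_owner_name", "(.*?)"\)',
                'description': r'<div class="datawrap">(.*?)</div>'}

    info = {}
    for name in patterns:
        mobj = re.search(patterns[name], page)
        if mobj is not None:
            # same unquote + unicode_escape treatment as _parse_page
            info[name] = urllib.unquote_plus(mobj.group(1).decode('unicode_escape'))
    print info   # e.g. {'owner': u'Some User', 'description': u'A clip'}
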
 +
 +      def _real_initialize(self):
 +              if self._downloader is None:
 +                      return
 +
 +              useremail = None
 +              password = None
 +              downloader_params = self._downloader.params
 +
 +              # Attempt to use provided username and password or .netrc data
 +              if downloader_params.get('username', None) is not None:
 +                      useremail = downloader_params['username']
 +                      password = downloader_params['password']
 +              elif downloader_params.get('usenetrc', False):
 +                      try:
 +                              info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 +                              if info is not None:
 +                                      useremail = info[0]
 +                                      password = info[2]
 +                              else:
 +                                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 +                      except (IOError, netrc.NetrcParseError), err:
 +                              self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 +                              return
 +
 +              if useremail is None:
 +                      return
 +
 +              # Log in
 +              login_form = {
 +                      'email': useremail,
 +                      'pass': password,
 +                      'login': 'Log+In'
 +                      }
 +              request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 +              try:
 +                      self.report_login()
 +                      login_results = urllib2.urlopen(request).read()
 +                      if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 +                              self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
 +                              return
 +              except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 +                      self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 +                      return
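
The login above only runs when credentials are available; when the usenetrc option is set, the code falls back to the standard netrc module. A hedged sketch of that path (the machine name matches _NETRC_MACHINE, the credentials below are placeholders):

    # ~/.netrc entry the extractor would pick up:
    #     machine facebook login me@example.com password hunter2

    import netrc

    # netrc.netrc() raises IOError if ~/.netrc does not exist
    info = netrc.netrc().authenticators('facebook')
    if info is not None:
        useremail, _account, password = info   # (login, account, password)
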
 +
 +      def _real_extract(self, url):
 +              mobj = re.match(self._VALID_URL, url)
 +              if mobj is None:
 +                      self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 +                      return
 +              video_id = mobj.group('ID')
 +
 +              # Get video webpage
 +              self.report_video_webpage_download(video_id)
 +              request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
 +              try:
 +                      page = urllib2.urlopen(request)
 +                      video_webpage = page.read()
 +              except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 +                      self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 +                      return
 +
 +              # Start extracting information
 +              self.report_information_extraction(video_id)
 +
 +              # Extract information
 +              video_info = self._parse_page(video_webpage)
 +
 +              # uploader
 +              if 'owner' not in video_info:
 +                      self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 +                      return
 +              video_uploader = video_info['owner']
 +
 +              # title
 +              if 'title' not in video_info:
 +                      self._downloader.trouble(u'ERROR: unable to extract video title')
 +                      return
 +              video_title = video_info['title']
 +              video_title = video_title.decode('utf-8')
 +              video_title = sanitize_title(video_title)
 +
 +              # simplified title
 +              simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 +              simple_title = simple_title.strip(ur'_')
 +
 +              # thumbnail image
 +              if 'thumbnail' not in video_info:
 +                      self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 +                      video_thumbnail = ''
 +              else:
 +                      video_thumbnail = video_info['thumbnail']
 +
 +              # upload date
 +              upload_date = u'NA'
 +              if 'upload_date' in video_info:
 +                      upload_time = video_info['upload_date']
 +                      timetuple = email.utils.parsedate_tz(upload_time)
 +                      if timetuple is not None:
 +                              try:
 +                                      upload_date = time.strftime('%Y%m%d', timetuple[0:9])
 +                              except:
 +                                      pass
 +
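
The upload date handling above leans on email.utils.parsedate_tz. A small worked example, assuming the page's data-date value is an RFC 2822 style date (the sample string is invented):

    import email.utils
    import time

    timetuple = email.utils.parsedate_tz('Thu, 17 Feb 2011 10:00:00 -0800')
    if timetuple is not None:
        # only the year/month/day fields matter for %Y%m%d
        print time.strftime('%Y%m%d', timetuple[0:9])   # 20110217
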
 +              # description
 +              video_description = 'No description available.'
 +              if (self._downloader.params.get('forcedescription', False) and
 +                  'description' in video_info):
 +                      video_description = video_info['description']
 +
 +              url_map = video_info['video_urls']
 +              if len(url_map.keys()) > 0:
 +                      # Decide which formats to download
 +                      req_format = self._downloader.params.get('format', None)
 +                      format_limit = self._downloader.params.get('format_limit', None)
 +
 +                      if format_limit is not None and format_limit in self._available_formats:
 +                              format_list = self._available_formats[self._available_formats.index(format_limit):]
 +                      else:
 +                              format_list = self._available_formats
 +                      existing_formats = [x for x in format_list if x in url_map]
 +                      if len(existing_formats) == 0:
 +                              self._downloader.trouble(u'ERROR: no known formats available for video')
 +                              return
 +                      if req_format is None:
 +                              video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 +                      elif req_format == '-1':
 +                              video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 +                      else:
 +                              # Specific format
 +                              if req_format not in url_map:
 +                                      self._downloader.trouble(u'ERROR: requested format not available')
 +                                      return
 +                              video_url_list = [(req_format, url_map[req_format])] # Specific format
 +              else:
 +                      # Without this branch video_url_list would be unbound below
 +                      self._downloader.trouble(u'ERROR: no video URLs found')
 +                      return
 +
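
The selection above mirrors the other extractors: format_limit trims the preference list, then req_format picks from whatever survived. A condensed sketch with the two Facebook formats (URLs are illustrative):

    _available_formats = ['highqual', 'lowqual']
    url_map = {'highqual': 'http://example.invalid/hi.mp4',
               'lowqual': 'http://example.invalid/lo.mp4'}

    format_limit = 'lowqual'   # e.g. from the format_limit downloader parameter
    format_list = _available_formats[_available_formats.index(format_limit):]
    existing_formats = [x for x in format_list if x in url_map]
    # req_format of None means best quality, i.e. the first surviving entry
    video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]
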
 +              for format_param, video_real_url in video_url_list:
 +
 +                      # At this point we have a new video
 +                      self._downloader.increment_downloads()
 +
 +                      # Extension
 +                      video_extension = self._video_extensions.get(format_param, 'mp4')
 +
 +                      # Hand the selected URL and metadata to the downloader
 +                      try:
 +                              # Process video information
 +                              self._downloader.process_info({
 +                                      'id':           video_id.decode('utf-8'),
 +                                      'url':          video_real_url.decode('utf-8'),
 +                                      'uploader':     video_uploader.decode('utf-8'),
 +                                      'upload_date':  upload_date,
 +                                      'title':        video_title,
 +                                      'stitle':       simple_title,
 +                                      'ext':          video_extension.decode('utf-8'),
 +                                      'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 +                                      'thumbnail':    video_thumbnail.decode('utf-8'),
 +                                      'description':  video_description.decode('utf-8'),
 +                                      'player_url':   None,
 +                              })
 +                      except UnavailableVideoError, err:
 +                              self._downloader.trouble(u'\nERROR: unable to download video')
 +
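
suitable() is what the downloader uses to route a URL to this extractor. A quick sketch of the kind of URL the pattern above is written for (the video id is made up):

    print FacebookIE.suitable('http://www.facebook.com/video/video.php?v=123456789')   # True
    print FacebookIE.suitable('http://vimeo.com/12345')                                # False
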
  class PostProcessor(object):
        """Post Processor class.
  
@@@ -2931,7 -2712,6 +2936,7 @@@ if __name__ == '__main__'
                yahoo_ie = YahooIE()
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
                deposit_files_ie = DepositFilesIE()
 +              facebook_ie = FacebookIE()
                generic_ie = GenericIE()
  
                # File downloader
                fd.add_info_extractor(yahoo_ie)
                fd.add_info_extractor(yahoo_search_ie)
                fd.add_info_extractor(deposit_files_ie)
 +              fd.add_info_extractor(facebook_ie)
  
                # This must come last since it's the
                # fallback if none of the others work
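
Registration order matters: extractors are tried in the order they were added, with GenericIE registered last as the catch-all. A minimal usage sketch once everything is wired up (the URL is illustrative):

    # after all add_info_extractor() calls, as in the script's __main__ block
    retcode = fd.download(['http://www.facebook.com/video/video.php?v=123456789'])
    sys.exit(retcode)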