Merge remote-tracking branch 'jaimeMF/split-downloaders'

author Philipp Hagemeister <phihag@phihag.de>
Mon, 23 Dec 2013 04:03:32 +0000 (05:03 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 23 Dec 2013 04:03:32 +0000 (05:03 +0100)
diff --git a/README.md b/README.md

index 68b2e1ae77187d6bf44522d52fc54d5c33f2c280..91e18e372b9a803dc7358f2e6d4af3b31eb754a5 100644 (file)
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ which means you can modify it, redistribute it or use it however you like.
                                 /youtube-dl .
      --no-cache-dir             Disable filesystem caching
      --bidi-workaround          Work around terminals that lack bidirectional
-                               text support. Requires fribidi executable in PATH
+                               text support. Requires bidiv or fribidi
+                               executable in PATH
  
  ## Video Selection:
      --playlist-start NUMBER    playlist video to start at (default is 1)
@@ -56,6 +57,10 @@ which means you can modify it, redistribute it or use it however you like.
      --date DATE                download only videos uploaded in this date
      --datebefore DATE          download only videos uploaded before this date
      --dateafter DATE           download only videos uploaded after this date
+    --min-views COUNT          Do not download any videos with less than COUNT
+                               views
+    --max-views COUNT          Do not download any videos with more than COUNT
+                               views
      --no-playlist              download only the currently playing video
      --age-limit YEARS          download only videos suitable for the given age
      --download-archive FILE    Download only videos not listed in the archive
@@ -127,6 +132,7 @@ which means you can modify it, redistribute it or use it however you like.
      --get-id                   simulate, quiet but print id
      --get-thumbnail            simulate, quiet but print thumbnail URL
      --get-description          simulate, quiet but print video description
+    --get-duration             simulate, quiet but print video length
      --get-filename             simulate, quiet but print output filename
      --get-format               simulate, quiet but print output format
      -j, --dump-json            simulate, quiet but print JSON information
diff --git a/test/test_all_urls.py b/test/test_all_urls.py

index e9458b2e331526de1c986c697c4a8d0ddd682f14..bd77b7c30149d556caa1237b4be4c06a56adc613 100644 (file)
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -10,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  from test.helper import get_testcases
  
  from youtube_dl.extractor import (
+    FacebookIE,
      gen_extractors,
      JustinTVIE,
      YoutubeIE,
@@ -87,12 +88,15 @@ class TestAllURLsMatching(unittest.TestCase):
          assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc')
          assertExtractId('BaW_jenozKc', 'BaW_jenozKc')
  
+    def test_facebook_matching(self):
+        self.assertTrue(FacebookIE.suitable(u'https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))
+
      def test_no_duplicates(self):
          ies = gen_extractors()
          for tc in get_testcases():
              url = tc['url']
              for ie in ies:
-                if type(ie).__name__ in ['GenericIE', tc['name'] + 'IE']:
+                if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'):
                      self.assertTrue(ie.suitable(url), '%s should match URL %r' % (type(ie).__name__, url))
                  else:
                      self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))
diff --git a/test/test_playlists.py b/test/test_playlists.py

index 87ca401e5be52eb24fd7dc6653691437ef060570..1b7b4e3d808cb936fa5fac07136049bd174a4490 100644 (file)
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -12,6 +12,7 @@ from test.helper import FakeYDL
  
  
  from youtube_dl.extractor import (
+    AcademicEarthCourseIE,
      DailymotionPlaylistIE,
      DailymotionUserIE,
      VimeoChannelIE,
@@ -26,7 +27,8 @@ from youtube_dl.extractor import (
      BambuserChannelIE,
      BandcampAlbumIE,
      SmotriCommunityIE,
-    SmotriUserIE
+    SmotriUserIE,
+    IviCompilationIE
  )
  
  
@@ -158,5 +160,34 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['title'], u'Inspector')
          self.assertTrue(len(result['entries']) >= 9)
  
+    def test_AcademicEarthCourse(self):
+        dl = FakeYDL()
+        ie = AcademicEarthCourseIE(dl)
+        result = ie.extract(u'http://academicearth.org/courses/building-dynamic-websites/')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'building-dynamic-websites')
+        self.assertEqual(result['title'], u'Building Dynamic Websites')
+        self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
+        self.assertEqual(len(result['entries']), 10)
+        
+    def test_ivi_compilation(self):
+        dl = FakeYDL()
+        ie = IviCompilationIE(dl)
+        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'dezhurnyi_angel')
+        self.assertEqual(result['title'], u'Дежурный ангел (2010 - 2012)')
+        self.assertTrue(len(result['entries']) >= 36)
+        
+    def test_ivi_compilation_season(self):
+        dl = FakeYDL()
+        ie = IviCompilationIE(dl)
+        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'dezhurnyi_angel/season2')
+        self.assertEqual(result['title'], u'Дежурный ангел (2010 - 2012) 2 сезон')
+        self.assertTrue(len(result['entries']) >= 20)
+
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py

index 0fa66beecd3f8f82b599704af260ca3d0aa0298d..e5778cd83ee9ea74e4786243f1e6279aed3697d3 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -13,20 +13,21 @@ import xml.etree.ElementTree
  
  #from youtube_dl.utils import htmlentity_transform
  from youtube_dl.utils import (
-    timeconvert,
-    sanitize_filename,
-    unescapeHTML,
-    orderedSet,
      DateRange,
-    unified_strdate,
+    encodeFilename,
      find_xpath_attr,
      get_meta_content,
-    xpath_with_ns,
-    smuggle_url,
-    unsmuggle_url,
+    orderedSet,
+    sanitize_filename,
      shell_quote,
-    encodeFilename,
+    smuggle_url,
      str_to_int,
+    timeconvert,
+    unescapeHTML,
+    unified_strdate,
+    unsmuggle_url,
+    url_basename,
+    xpath_with_ns,
  )
  
  if sys.version_info < (3, 0):
@@ -181,6 +182,15 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(str_to_int('123,456'), 123456)
          self.assertEqual(str_to_int('123.456'), 123456)
  
+    def test_url_basename(self):
+        self.assertEqual(url_basename(u'http://foo.de/'), u'')
+        self.assertEqual(url_basename(u'http://foo.de/bar/baz'), u'baz')
+        self.assertEqual(url_basename(u'http://foo.de/bar/baz?x=y'), u'baz')
+        self.assertEqual(url_basename(u'http://foo.de/bar/baz#x=y'), u'baz')
+        self.assertEqual(url_basename(u'http://foo.de/bar/baz/'), u'baz')
+        self.assertEqual(
+            url_basename(u'http://media.w3.org/2010/05/sintel/trailer.mp4'),
+            u'trailer.mp4')
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 414aa5a80cb575642ee4ff20e393c7c96afb4e14..04771c6372dbeb4463af5d87cf3775a39324334a 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -34,6 +34,7 @@ from .utils import (
      encodeFilename,
      ExtractorError,
      format_bytes,
+    formatSeconds,
      get_term_width,
      locked_file,
      make_HTTPS_handler,
@@ -46,6 +47,7 @@ from .utils import (
      subtitles_filename,
      takewhile_inclusive,
      UnavailableVideoError,
+    url_basename,
      write_json_file,
      write_string,
      YoutubeDLHandler,
@@ -94,6 +96,7 @@ class YoutubeDL(object):
      forcethumbnail:    Force printing thumbnail URL.
      forcedescription:  Force printing description.
      forcefilename:     Force printing final filename.
+    forceduration:     Force printing duration.
      forcejson:         Force printing info_dict as JSON.
      simulate:          Do not download the video files.
      format:            Video format code.
@@ -127,7 +130,16 @@ class YoutubeDL(object):
      noplaylist:        Download single video instead of a playlist if in doubt.
      age_limit:         An integer representing the user's age in years.
                         Unsuitable videos for the given age are skipped.
-    download_archive:   File name of a file where all downloads are recorded.
+    min_views:         An integer representing the minimum view count the video
+                       must have in order to not be skipped.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    max_views:         An integer representing the maximum view count.
+                       Videos that are more popular than that are not
+                       downloaded.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    download_archive:  File name of a file where all downloads are recorded.
                         Videos already present in the file are not downloaded
                         again.
      cookiefile:        File name where cookies should be read from and dumped to.
@@ -171,12 +183,18 @@ class YoutubeDL(object):
                      width_args = []
                  else:
                      width_args = ['-w', str(width)]
-                self._fribidi = subprocess.Popen(
-                    ['fribidi', '-c', 'UTF-8'] + width_args,
+                sp_kwargs = dict(
                      stdin=subprocess.PIPE,
                      stdout=slave,
                      stderr=self._err_file)
-                self._fribidi_channel = os.fdopen(master, 'rb')
+                try:
+                    self._output_process = subprocess.Popen(
+                        ['bidiv'] + width_args, **sp_kwargs
+                    )
+                except OSError:
+                    self._output_process = subprocess.Popen(
+                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+                self._output_channel = os.fdopen(master, 'rb')
              except OSError as ose:
                  if ose.errno == 2:
                      self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
@@ -233,14 +251,15 @@ class YoutubeDL(object):
          self._fd_progress_hooks.append(ph)
  
      def _bidi_workaround(self, message):
-        if not hasattr(self, '_fribidi_channel'):
+        if not hasattr(self, '_output_channel'):
              return message
  
+        assert hasattr(self, '_output_process')
          assert type(message) == type(u'')
          line_count = message.count(u'\n') + 1
-        self._fribidi.stdin.write((message + u'\n').encode('utf-8'))
-        self._fribidi.stdin.flush()
-        res = u''.join(self._fribidi_channel.readline().decode('utf-8')
+        self._output_process.stdin.write((message + u'\n').encode('utf-8'))
+        self._output_process.stdin.flush()
+        res = u''.join(self._output_channel.readline().decode('utf-8')
                         for _ in range(line_count))
          return res[:-len(u'\n')]
  
@@ -357,22 +376,6 @@ class YoutubeDL(object):
          error_message = u'%s %s' % (_msg_header, message)
          self.trouble(error_message, tb)
  
-    def report_writedescription(self, descfn):
-        """ Report that the description file is being written """
-        self.to_screen(u'[info] Writing video description to: ' + descfn)
-
-    def report_writesubtitles(self, sub_filename):
-        """ Report that the subtitles file is being written """
-        self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
-
-    def report_writeinfojson(self, infofn):
-        """ Report that the metadata file has been written """
-        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
-
-    def report_writeannotations(self, annofn):
-        """ Report that the annotations file has been written. """
-        self.to_screen(u'[info] Writing video annotations to: ' + annofn)
-
      def report_file_already_downloaded(self, file_name):
          """Report file has already been fully downloaded."""
          try:
@@ -417,13 +420,14 @@ class YoutubeDL(object):
      def _match_entry(self, info_dict):
          """ Returns None iff the file should be downloaded """
  
+        video_title = info_dict.get('title', info_dict.get('id', u'video'))
          if 'title' in info_dict:
              # This can happen when we're just evaluating the playlist
              title = info_dict['title']
              matchtitle = self.params.get('matchtitle', False)
              if matchtitle:
                  if not re.search(matchtitle, title, re.IGNORECASE):
-                    return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+                    return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
              rejecttitle = self.params.get('rejecttitle', False)
              if rejecttitle:
                  if re.search(rejecttitle, title, re.IGNORECASE):
@@ -432,14 +436,21 @@ class YoutubeDL(object):
          if date is not None:
              dateRange = self.params.get('daterange', DateRange())
              if date not in dateRange:
-                return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+                return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+        view_count = info_dict.get('view_count', None)
+        if view_count is not None:
+            min_views = self.params.get('min_views')
+            if min_views is not None and view_count < min_views:
+                return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+            max_views = self.params.get('max_views')
+            if max_views is not None and view_count > max_views:
+                return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
          age_limit = self.params.get('age_limit')
          if age_limit is not None:
              if age_limit < info_dict.get('age_limit', 0):
                  return u'Skipping "' + title + '" because it is age restricted'
          if self.in_download_archive(info_dict):
-            return (u'%s has already been recorded in archive'
-                    % info_dict.get('title', info_dict.get('id', u'video')))
+            return u'%s has already been recorded in archive' % video_title
          return None
  
      @staticmethod
@@ -483,6 +494,7 @@ class YoutubeDL(object):
                      {
                          'extractor': ie.IE_NAME,
                          'webpage_url': url,
+                        'webpage_url_basename': url_basename(url),
                          'extractor_key': ie.ie_key(),
                      })
                  if process:
@@ -556,16 +568,16 @@ class YoutubeDL(object):
  
              n_all_entries = len(ie_result['entries'])
              playliststart = self.params.get('playliststart', 1) - 1
-            playlistend = self.params.get('playlistend', -1)
-
+            playlistend = self.params.get('playlistend', None)
+            # For backwards compatibility, interpret -1 as whole list
              if playlistend == -1:
-                entries = ie_result['entries'][playliststart:]
-            else:
-                entries = ie_result['entries'][playliststart:playlistend]
+                playlistend = None
  
+            entries = ie_result['entries'][playliststart:playlistend]
              n_entries = len(entries)
  
-            self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
+            self.to_screen(
+                u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                  (ie_result['extractor'], playlist, n_all_entries, n_entries))
  
              for i, entry in enumerate(entries, 1):
@@ -575,6 +587,7 @@ class YoutubeDL(object):
                      'playlist_index': i + playliststart,
                      'extractor': ie_result['extractor'],
                      'webpage_url': ie_result['webpage_url'],
+                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                      'extractor_key': ie_result['extractor_key'],
                  }
  
@@ -595,6 +608,7 @@ class YoutubeDL(object):
                      {
                          'extractor': ie_result['extractor'],
                          'webpage_url': ie_result['webpage_url'],
+                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                          'extractor_key': ie_result['extractor_key'],
                      })
                  return r
@@ -631,7 +645,7 @@ class YoutubeDL(object):
              info_dict['playlist_index'] = None
  
          # This extractors handle format selection themselves
-        if info_dict['extractor'] in [u'youtube', u'Youku']:
+        if info_dict['extractor'] in [u'Youku']:
              if download:
                  self.process_info(info_dict)
              return info_dict
@@ -657,10 +671,6 @@ class YoutubeDL(object):
              if 'ext' not in format:
                  format['ext'] = determine_ext(format['url'])
  
-        if self.params.get('listformats', None):
-            self.list_formats(info_dict)
-            return
-
          format_limit = self.params.get('format_limit', None)
          if format_limit:
              formats = list(takewhile_inclusive(
@@ -673,9 +683,16 @@ class YoutubeDL(object):
                  except ValueError:
                      ext_ord = -1
                  # We only compare the extension if they have the same height and width
-                return (f.get('height'), f.get('width'), ext_ord)
+                return (f.get('height') if f.get('height') is not None else -1,
+                        f.get('width') if f.get('width') is not None else -1,
+                        ext_ord)
              formats = sorted(formats, key=_free_formats_key)
  
+        info_dict['formats'] = formats
+        if self.params.get('listformats', None):
+            self.list_formats(info_dict)
+            return
+
          req_format = self.params.get('format', 'best')
          if req_format is None:
              req_format = 'best'
@@ -750,6 +767,8 @@ class YoutubeDL(object):
              self.to_stdout(info_dict['description'])
          if self.params.get('forcefilename', False) and filename is not None:
              self.to_stdout(filename)
+        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+            self.to_stdout(formatSeconds(info_dict['duration']))
          if self.params.get('forceformat', False):
              self.to_stdout(info_dict['format'])
          if self.params.get('forcejson', False):
@@ -772,28 +791,34 @@ class YoutubeDL(object):
              return
  
          if self.params.get('writedescription', False):
-            try:
-                descfn = filename + u'.description'
-                self.report_writedescription(descfn)
-                with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
-                    descfile.write(info_dict['description'])
-            except (KeyError, TypeError):
-                self.report_warning(u'There\'s no description to write.')
-            except (OSError, IOError):
-                self.report_error(u'Cannot write description file ' + descfn)
-                return
+            descfn = filename + u'.description'
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
+                self.to_screen(u'[info] Video description is already present')
+            else:
+                try:
+                    self.to_screen(u'[info] Writing video description to: ' + descfn)
+                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
+                        descfile.write(info_dict['description'])
+                except (KeyError, TypeError):
+                    self.report_warning(u'There\'s no description to write.')
+                except (OSError, IOError):
+                    self.report_error(u'Cannot write description file ' + descfn)
+                    return
  
          if self.params.get('writeannotations', False):
-            try:
-                annofn = filename + u'.annotations.xml'
-                self.report_writeannotations(annofn)
-                with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
-                    annofile.write(info_dict['annotations'])
-            except (KeyError, TypeError):
-                self.report_warning(u'There are no annotations to write.')
-            except (OSError, IOError):
-                self.report_error(u'Cannot write annotations file: ' + annofn)
-                return
+            annofn = filename + u'.annotations.xml'
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
+                self.to_screen(u'[info] Video annotations are already present')
+            else:
+                try:
+                    self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                        annofile.write(info_dict['annotations'])
+                except (KeyError, TypeError):
+                    self.report_warning(u'There are no annotations to write.')
+                except (OSError, IOError):
+                    self.report_error(u'Cannot write annotations file: ' + annofn)
+                    return
  
          subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                         self.params.get('writeautomaticsub')])
@@ -809,38 +834,48 @@ class YoutubeDL(object):
                      continue
                  try:
                      sub_filename = subtitles_filename(filename, sub_lang, sub_format)
-                    self.report_writesubtitles(sub_filename)
-                    with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
-                            subfile.write(sub)
+                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+                        self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+                    else:
+                        self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
+                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
+                                subfile.write(sub)
                  except (OSError, IOError):
                      self.report_error(u'Cannot write subtitles file ' + descfn)
                      return
  
          if self.params.get('writeinfojson', False):
              infofn = os.path.splitext(filename)[0] + u'.info.json'
-            self.report_writeinfojson(infofn)
-            try:
-                json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
-                write_json_file(json_info_dict, encodeFilename(infofn))
-            except (OSError, IOError):
-                self.report_error(u'Cannot write metadata to JSON file ' + infofn)
-                return
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
+                self.to_screen(u'[info] Video description metadata is already present')
+            else:
+                self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
+                try:
+                    json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
+                    write_json_file(json_info_dict, encodeFilename(infofn))
+                except (OSError, IOError):
+                    self.report_error(u'Cannot write metadata to JSON file ' + infofn)
+                    return
  
          if self.params.get('writethumbnail', False):
              if info_dict.get('thumbnail') is not None:
                  thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
                  thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
-                self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
-                               (info_dict['extractor'], info_dict['id']))
-                try:
-                    uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
-                    with open(thumb_filename, 'wb') as thumbf:
-                        shutil.copyfileobj(uf, thumbf)
-                    self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
-                        (info_dict['extractor'], info_dict['id'], thumb_filename))
-                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                    self.report_warning(u'Unable to download thumbnail "%s": %s' %
-                        (info_dict['thumbnail'], compat_str(err)))
+                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
+                    self.to_screen(u'[%s] %s: Thumbnail is already present' %
+                                   (info_dict['extractor'], info_dict['id']))
+                else:
+                    self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
+                                   (info_dict['extractor'], info_dict['id']))
+                    try:
+                        uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+                        with open(thumb_filename, 'wb') as thumbf:
+                            shutil.copyfileobj(uf, thumbf)
+                        self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
+                            (info_dict['extractor'], info_dict['id'], thumb_filename))
+                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                        self.report_warning(u'Unable to download thumbnail "%s": %s' %
+                            (info_dict['thumbnail'], compat_str(err)))
  
          if not self.params.get('skip_download', False):
              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index 3e82cd637dad9c59d89580daac1379826326f600..c37d28c5997ed8d348afdcefe61acef1b0bd511e 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -37,6 +37,7 @@ __authors__  = (
      'Anton Larionov',
      'Takuya Tsuchida',
      'Sergey M.',
+    'Michael Orlitzky',
  )
  
  __license__ = 'Public Domain'
@@ -55,13 +56,13 @@ from .utils import (
      compat_print,
      DateRange,
      decodeOption,
-    determine_ext,
      get_term_width,
      DownloadError,
      get_cachedir,
      MaxDownloadsReached,
      preferredencoding,
      SameFileError,
+    setproctitle,
      std_headers,
      write_string,
  )
@@ -193,13 +194,17 @@ def parseOpts(overrideArguments=None):
          type=float, default=None, help=optparse.SUPPRESS_HELP)
      general.add_option(
          '--bidi-workaround', dest='bidi_workaround', action='store_true',
-        help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH')
+        help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
  
  
-    selection.add_option('--playlist-start',
-            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is %default)', default=1)
-    selection.add_option('--playlist-end',
-            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
+    selection.add_option(
+        '--playlist-start',
+        dest='playliststart', metavar='NUMBER', default=1, type=int,
+        help='playlist video to start at (default is %default)')
+    selection.add_option(
+        '--playlist-end',
+        dest='playlistend', metavar='NUMBER', default=None, type=int,
+        help='playlist video to end at (default is last)')
      selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
      selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
      selection.add_option('--max-downloads', metavar='NUMBER',
@@ -210,6 +215,14 @@ def parseOpts(overrideArguments=None):
      selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
      selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None)
      selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None)
+    selection.add_option(
+        '--min-views', metavar='COUNT', dest='min_views',
+        default=None, type=int,
+        help="Do not download any videos with less than COUNT views",)
+    selection.add_option(
+        '--max-views', metavar='COUNT', dest='max_views',
+        default=None, type=int,
+        help="Do not download any videos with more than COUNT views",)
      selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False)
      selection.add_option('--age-limit', metavar='YEARS', dest='age_limit',
                           help='download only videos suitable for the given age',
@@ -290,6 +303,9 @@ def parseOpts(overrideArguments=None):
      verbosity.add_option('--get-description',
              action='store_true', dest='getdescription',
              help='simulate, quiet but print video description', default=False)
+    verbosity.add_option('--get-duration',
+            action='store_true', dest='getduration',
+            help='simulate, quiet but print video length', default=False)
      verbosity.add_option('--get-filename',
              action='store_true', dest='getfilename',
              help='simulate, quiet but print output filename', default=False)
@@ -460,12 +476,15 @@ def parseOpts(overrideArguments=None):
  
      return parser, opts, args
  
+
  def _real_main(argv=None):
      # Compatibility fixes for Windows
      if sys.platform == 'win32':
          # https://github.com/rg3/youtube-dl/issues/820
          codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
  
+    setproctitle(u'youtube-dl')
+
      parser, opts, args = parseOpts(argv)
  
      # Set user agent
@@ -505,7 +524,6 @@ def _real_main(argv=None):
          for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()):
              compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
              matchedUrls = [url for url in all_urls if ie.suitable(url)]
-            all_urls = [url for url in all_urls if url not in matchedUrls]
              for mu in matchedUrls:
                  compat_print(u'  ' + mu)
          sys.exit(0)
@@ -560,18 +578,10 @@ def _real_main(argv=None):
          if numeric_buffersize is None:
              parser.error(u'invalid buffer size specified')
          opts.buffersize = numeric_buffersize
-    try:
-        opts.playliststart = int(opts.playliststart)
-        if opts.playliststart <= 0:
-            raise ValueError(u'Playlist start must be positive')
-    except (TypeError, ValueError):
-        parser.error(u'invalid playlist start number specified')
-    try:
-        opts.playlistend = int(opts.playlistend)
-        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
-            raise ValueError(u'Playlist end must be greater than playlist start')
-    except (TypeError, ValueError):
-        parser.error(u'invalid playlist end number specified')
+    if opts.playliststart <= 0:
+        raise ValueError(u'Playlist start must be positive')
+    if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart:
+        raise ValueError(u'Playlist end must be greater than playlist start')
      if opts.extractaudio:
          if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
              parser.error(u'invalid audio format specified')
@@ -604,27 +614,30 @@ def _real_main(argv=None):
              or (opts.useid and u'%(id)s.%(ext)s')
              or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
              or u'%(title)s-%(id)s.%(ext)s')
-    if '%(ext)s' not in outtmpl and opts.extractaudio:
+    if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
          parser.error(u'Cannot download a video and extract audio into the same'
-                     u' file! Use "%%(ext)s" instead of %r' %
-                     determine_ext(outtmpl, u''))
+                     u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
+                     u' template'.format(outtmpl))
+
+    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson
  
      ydl_opts = {
          'usenetrc': opts.usenetrc,
          'username': opts.username,
          'password': opts.password,
          'videopassword': opts.videopassword,
-        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
+        'quiet': (opts.quiet or any_printing),
          'forceurl': opts.geturl,
          'forcetitle': opts.gettitle,
          'forceid': opts.getid,
          'forcethumbnail': opts.getthumbnail,
          'forcedescription': opts.getdescription,
+        'forceduration': opts.getduration,
          'forcefilename': opts.getfilename,
          'forceformat': opts.getformat,
          'forcejson': opts.dumpjson,
          'simulate': opts.simulate,
-        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
+        'skip_download': (opts.skip_download or opts.simulate or any_printing),
          'format': opts.format,
          'format_limit': opts.format_limit,
          'listformats': opts.listformats,
@@ -668,6 +681,8 @@ def _real_main(argv=None):
          'keepvideo': opts.keepvideo,
          'min_filesize': opts.min_filesize,
          'max_filesize': opts.max_filesize,
+        'min_views': opts.min_views,
+        'max_views': opts.max_views,
          'daterange': date,
          'cachedir': opts.cachedir,
          'youtube_print_sig_code': opts.youtube_print_sig_code,
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py

index 9a0c93fa6f4efb415f7e6dad25239a4c219a2542..e9c5e21521d66baa177986e8ca878e3fc1a75461 100644 (file)
--- a/youtube_dl/aes.py
+++ b/youtube_dl/aes.py
@@ -1,4 +1,4 @@
-__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_decrypt_text']
+__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text']
  
  import base64
  from math import ceil
@@ -32,6 +32,31 @@ def aes_ctr_decrypt(data, key, counter):
      
      return decrypted_data
  
+def aes_cbc_decrypt(data, key, iv):
+    """
+    Decrypt with aes in CBC mode
+
+    A short final block is zero-padded before it is decrypted and the
+    result is cut back to len(data).
+
+    @param {int[]} data        cipher
+    @param {int[]} key         16/24/32-Byte cipher key
+    @param {int[]} iv          16-Byte IV
+    @returns {int[]}           decrypted data
+    """
+    round_keys = key_expansion(key)
+    total_blocks = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+    plain = []
+    chain_block = iv  # block that gets XORed onto the next decrypted block
+    for start in range(0, total_blocks * BLOCK_SIZE_BYTES, BLOCK_SIZE_BYTES):
+        cipher_block = data[start:start + BLOCK_SIZE_BYTES]
+        cipher_block += [0] * (BLOCK_SIZE_BYTES - len(cipher_block))
+        plain += xor(aes_decrypt(cipher_block, round_keys), chain_block)
+        chain_block = cipher_block
+
+    return plain[:len(data)]
+
  def key_expansion(data):
      """
      Generate key schedule
@@ -75,7 +100,7 @@ def aes_encrypt(data, expanded_key):
      @returns {int[]}             16-Byte cipher
      """
      rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
-    
+
      data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
      for i in range(1, rounds+1):
          data = sub_bytes(data)
@@ -83,6 +108,26 @@ def aes_encrypt(data, expanded_key):
          if i != rounds:
              data = mix_columns(data)
          data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES])
+
+    return data
+
+def aes_decrypt(data, expanded_key):
+    """
+    Decrypt one block with aes (applies the round keys from last to first)
+
+    @param {int[]} data          16-Byte cipher
+    @param {int[]} expanded_key  176/208/240-Byte expanded key
+    @returns {int[]}             16-Byte state
+    """
+    rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+    for i in reversed(range(1, rounds + 1)):
+        round_key = expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]
+        data = xor(data, round_key)
+        if i != rounds:  # aes_encrypt skips mix_columns in its last round
+            data = mix_columns_inv(data)
+        data = sub_bytes_inv(shift_rows_inv(data))
+    data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
      
      return data
  
@@ -139,14 +184,69 @@ SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B,
          0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
          0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
          0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16)
-MIX_COLUMN_MATRIX = ((2,3,1,1),
-                     (1,2,3,1),
-                     (1,1,2,3),
-                     (3,1,1,2))
+SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+            0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+            0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+            0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+            0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+            0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+            0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+            0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+            0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+            0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+            0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+            0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+            0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+            0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+            0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+            0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d)
+MIX_COLUMN_MATRIX = ((0x2,0x3,0x1,0x1),
+                     (0x1,0x2,0x3,0x1),
+                     (0x1,0x1,0x2,0x3),
+                     (0x3,0x1,0x1,0x2))
+MIX_COLUMN_MATRIX_INV = ((0xE,0xB,0xD,0x9),
+                         (0x9,0xE,0xB,0xD),
+                         (0xD,0x9,0xE,0xB),
+                         (0xB,0xD,0x9,0xE))
+RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35,
+                      0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA,
+                      0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31,
+                      0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD,
+                      0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88,
+                      0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A,
+                      0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3,
+                      0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0,
+                      0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41,
+                      0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75,
+                      0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80,
+                      0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54,
+                      0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA,
+                      0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E,
+                      0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17,
+                      0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01)
+RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
+                      0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
+                      0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
+                      0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
+                      0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
+                      0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
+                      0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
+                      0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
+                      0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
+                      0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
+                      0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
+                      0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
+                      0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
+                      0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
+                      0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
+                      0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07)
  
  def sub_bytes(data):
      return [SBOX[x] for x in data]
  
+def sub_bytes_inv(data):  # counterpart of sub_bytes, using SBOX_INV
+    return [SBOX_INV[value] for value in data]
+
  def rotate(data):
      return data[1:] + [data[0]]
  
@@ -160,30 +260,31 @@ def key_schedule_core(data, rcon_iteration):
  def xor(data1, data2):
      return [x^y for x, y in zip(data1, data2)]
  
-def mix_column(data):
+def rijndael_mul(a, b):
+    if not (a and b):  # zero operand: result is 0, the log table is not consulted
+        return 0
+    return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF]
+
+def mix_column(data, matrix):
      data_mixed = []
      for row in range(4):
          mixed = 0
          for column in range(4):
-            addend = data[column]
-            if MIX_COLUMN_MATRIX[row][column] in (2,3):
-                addend <<= 1
-                if addend > 0xff:
-                    addend &= 0xff
-                    addend ^= 0x1b
-                if MIX_COLUMN_MATRIX[row][column] == 3:
-                    addend ^= data[column]
-            mixed ^= addend & 0xff
+            # xor is (+) and (-)
+            mixed ^= rijndael_mul(data[column], matrix[row][column])
          data_mixed.append(mixed)
      return data_mixed
  
-def mix_columns(data):
+def mix_columns(data, matrix=MIX_COLUMN_MATRIX):
      data_mixed = []
      for i in range(4):
          column = data[i*4 : (i+1)*4]
-        data_mixed += mix_column(column)
+        data_mixed += mix_column(column, matrix)
      return data_mixed
  
+def mix_columns_inv(data):  # mix_columns driven by the inverse matrix
+    return mix_columns(data, matrix=MIX_COLUMN_MATRIX_INV)
+
  def shift_rows(data):
      data_shifted = []
      for column in range(4):
@@ -191,6 +292,13 @@ def shift_rows(data):
              data_shifted.append( data[((column + row) & 0b11) * 4 + row] )
      return data_shifted
  
+def shift_rows_inv(data):
+    # Mirror of shift_rows: the source column is (column - row) instead of (column + row)
+    return [
+        data[((col - row) % 4) * 4 + row]
+        for col in range(4) for row in range(4)
+    ]
+
  def inc(data):
      data = data[:] # copy
      for i in range(len(data)-1,-1,-1):
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 1149dc1ec497b6cfffea1da3db70dea39469f4b3..a39a1e2f49803161913442236244b1910d27755c 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,6 +1,8 @@
-from .appletrailers import AppleTrailersIE
+from .academicearth import AcademicEarthCourseIE
  from .addanime import AddAnimeIE
  from .anitube import AnitubeIE
+from .aparat import AparatIE
+from .appletrailers import AppleTrailersIE
  from .archiveorg import ArchiveOrgIE
  from .ard import ARDIE
  from .arte import (
@@ -13,6 +15,7 @@ from .arte import (
  from .auengine import AUEngineIE
  from .bambuser import BambuserIE, BambuserChannelIE
  from .bandcamp import BandcampIE, BandcampAlbumIE
+from .blinkx import BlinkxIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .bloomberg import BloombergIE
  from .breakcom import BreakIE
@@ -20,6 +23,8 @@ from .brightcove import BrightcoveIE
  from .c56 import C56IE
  from .canalplus import CanalplusIE
  from .canalc2 import Canalc2IE
+from .cbs import CBSIE
+from .channel9 import Channel9IE
  from .cinemassacre import CinemassacreIE
  from .clipfish import ClipfishIE
  from .clipsyndicate import ClipsyndicateIE
@@ -28,6 +33,7 @@ from .collegehumor import CollegeHumorIE
  from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
  from .condenast import CondeNastIE
  from .criterion import CriterionIE
+from .crunchyroll import CrunchyrollIE
  from .cspan import CSpanIE
  from .d8 import D8IE
  from .dailymotion import (
@@ -78,6 +84,10 @@ from .ina import InaIE
  from .infoq import InfoQIE
  from .instagram import InstagramIE
  from .internetvideoarchive import InternetVideoArchiveIE
+from .ivi import (
+    IviIE,
+    IviCompilationIE
+)
  from .jeuxvideo import JeuxVideoIE
  from .jukebox import JukeboxIE
  from .justintv import JustinTVIE
@@ -87,6 +97,7 @@ from .kickstarter import KickStarterIE
  from .keek import KeekIE
  from .liveleak import LiveLeakIE
  from .livestream import LivestreamIE, LivestreamOriginalIE
+from .mdr import MDRIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mit import TechTVMITIE, MITIE
@@ -111,9 +122,11 @@ from .orf import ORFIE
  from .pbs import PBSIE
  from .photobucket import PhotobucketIE
  from .podomatic import PodomaticIE
+from .pornhd import PornHdIE
  from .pornhub import PornHubIE
  from .pornotube import PornotubeIE
  from .pyvideo import PyvideoIE
+from .radiofrance import RadioFranceIE
  from .rbmaradio import RBMARadioIE
  from .redtube import RedTubeIE
  from .ringtv import RingTVIE
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py

new file mode 100644 (file)

index 0000000..ac05f82
--- /dev/null
+++ b/youtube_dl/extractor/academicearth.py
@@ -0,0 +1,31 @@
+import re
+
+from .common import InfoExtractor
+
+
+class AcademicEarthCourseIE(InfoExtractor):
+    """Build a playlist from an academicearth.org course or playlist page."""
+    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+    IE_NAME = u'AcademicEarth:Course'
+
+    def _real_extract(self, url):
+        """Scrape title, description and lecture links from the page at url."""
+        playlist_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, playlist_id)
+
+        # Every lecture is linked from an <h3 class="lecture-title"> heading;
+        # each link is wrapped with url_result to become a playlist entry.
+        lecture_urls = re.findall(
+            r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+            webpage)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': self._html_search_regex(
+                r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title'),
+            'description': self._html_search_regex(
+                r'<p class="excerpt">(.*?)</p>',
+                webpage, u'description', fatal=False),
+            'entries': [self.url_result(u) for u in lecture_urls],
+        }
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py

new file mode 100644 (file)

index 0000000..7e93bc4
--- /dev/null
+++ b/youtube_dl/extractor/aparat.py
@@ -0,0 +1,56 @@
+#coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    HEADRequest,
+)
+
+
+class AparatIE(InfoExtractor):
+    """Extractor for aparat.com video pages and embed URLs."""
+    _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+
+    _TEST = {
+        u'url': u'http://www.aparat.com/v/wP8On',
+        u'file': u'wP8On.mp4',
+        u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1',
+        u'info_dict': {
+            u"title": u"تیم گلکسی 11 - زومیت",
+        },
+        #u'skip': u'Extremely unreliable',
+    }
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group('id')
+
+        # Note: There is an easier-to-parse configuration at
+        # http://www.aparat.com/video/video/config/videohash/%video_id
+        # but the URL in there does not work
+        embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' +
+                     video_id + u'/vt/frame')
+        webpage = self._download_webpage(embed_url, video_id)
+
+        candidates = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage)
+        # Probe the candidates in page order; keep the first truthy response
+        for index, video_url in enumerate(candidates):
+            probe = self._request_webpage(
+                HEADRequest(video_url), video_id,
+                note=u'Testing video URL %d' % index, errnote=False)
+            if probe:
+                break
+        else:
+            # for/else: reached only when no candidate broke out of the loop
+            raise ExtractorError(u'No working video URLs found')
+
+        return {
+            'id': video_id,
+            'title': self._search_regex(
+                r'\s+title:\s*"([^"]+)"', webpage, u'title'),
+            'url': video_url,
+            'ext': 'mp4',
+            'thumbnail': self._search_regex(
+                r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False),
+        }
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index 4b7bef775ee1e029a1093785ffc32c9f220f3fa6..9254fbfe0de5cb9138cb50deeb4719f94c18f92e 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -266,20 +266,6 @@ class ArteTVDDCIE(ArteTVPlus7IE):
      IE_NAME = u'arte.tv:ddc'
      _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
  
-    _TEST = {
-        u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien',
-        u'file': u'049881-009_PLUS7-D.flv',
-        u'info_dict': {
-            u'title': u'Mit offenen Karten',
-            u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6',
-            u'upload_date': u'20131207',
-        },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
-        },
-    }
-
      def _real_extract(self, url):
          video_id, lang = self._extract_url_info(url)
          if lang == 'folge':
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py

new file mode 100644 (file)

index 0000000..144ce64
--- /dev/null
+++ b/youtube_dl/extractor/blinkx.py
@@ -0,0 +1,90 @@
+import datetime
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    remove_start,
+)
+
+
+class BlinkxIE(InfoExtractor):
+    """Extractor for blinkx.com "ce" pages and blinkx:<id> pseudo-URLs."""
+    _VALID_URL = r'^(?:https?://(?:www\.)?blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
+    IE_NAME = u'blinkx'
+
+    _TEST = {
+        u'url': u'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB',
+        u'file': u'8aQUy7GV.mp4',
+        u'md5': u'2e9a07364af40163a908edbf10bb2492',
+        u'info_dict': {
+            u"title": u"Police Car Rolls Away",
+            u"uploader": u"stupidvideos.com",
+            u"upload_date": u"20131215",
+            u"description": u"A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!",
+            u"duration": 14.886,
+            u"thumbnails": [{
+                "width": 100,
+                "height": 76,
+                "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg",
+            }],
+        },
+    }
+
+    def _real_extract(self, url):
+        """Look the video up in the blinkx play_video API and build its info dict."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        display_id = video_id[:8]
+
+        api_url = (u'https://apib4.blinkx.com/api.php?action=play_video&' +
+                   u'video=%s' % video_id)
+        data_json = self._download_webpage(api_url, display_id)
+        data = json.loads(data_json)['api']['results'][0]
+        # UTC keeps upload_date independent of the local timezone
+        dt = datetime.datetime.utcfromtimestamp(data['pubdate_epoch'])
+        upload_date = dt.strftime('%Y%m%d')
+
+        duration = None
+        thumbnails = []
+        formats = []
+        for media in data['media']:
+            if media['type'] == 'jpg':
+                thumbnails.append({
+                    'url': media['link'],
+                    'width': int(media['w']),
+                    'height': int(media['h']),
+                })
+            elif media['type'] == 'original':
+                duration = media['d']
+            elif media['type'] == 'youtube':
+                yt_id = media['link']
+                self.to_screen(u'Youtube video detected: %s' % yt_id)
+                return self.url_result(yt_id, 'Youtube', video_id=yt_id)
+            elif media['type'] in ('flv', 'mp4'):
+                vcodec = remove_start(media['vcodec'], 'ff')
+                acodec = remove_start(media['acodec'], 'ff')
+                total_rate = (int(media['vbr']) + int(media['abr'])) // 1000
+                format_id = u'%s-%sk-%s' % (vcodec, total_rate, media['w'])
+                formats.append({
+                    'format_id': format_id,
+                    'url': media['link'],
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                    'abr': int(media['abr']) // 1000,
+                    'vbr': int(media['vbr']) // 1000,
+                    'width': int(media['w']),
+                    'height': int(media['h']),
+                })
+        formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr']))
+
+        return {
+            'id': display_id,
+            'fullid': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'uploader': data['channel_name'],
+            'upload_date': upload_date,
+            'description': data.get('description'),
+            'thumbnails': thumbnails,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py

index 5e33a69df42fcbaa1b17f1737d66f5841ca50318..0e63208dfbe5a68b758a683a77bf0e09911ba282 100644 (file)
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -70,13 +70,14 @@ class BlipTVIE(InfoExtractor):
          info = None
          urlh = self._request_webpage(request, None, False,
              u'unable to download video info webpage')
+
          if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
              basename = url.split('/')[-1]
              title,ext = os.path.splitext(basename)
              title = title.decode('UTF-8')
              ext = ext.replace('.', '')
              self.report_direct_download(title)
-            info = {
+            return {
                  'id': title,
                  'url': url,
                  'uploader': None,
@@ -85,49 +86,47 @@ class BlipTVIE(InfoExtractor):
                  'ext': ext,
                  'urlhandle': urlh
              }
-        if info is None: # Regular URL
-            try:
-                json_code_bytes = urlh.read()
-                json_code = json_code_bytes.decode('utf-8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
-
-            try:
-                json_data = json.loads(json_code)
-                if 'Post' in json_data:
-                    data = json_data['Post']
-                else:
-                    data = json_data
-
-                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
-                if 'additionalMedia' in data:
-                    formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
-                    best_format = formats[-1]
-                    video_url = best_format['url']
-                else:
-                    video_url = data['media']['url']
-                umobj = re.match(self._URL_EXT, video_url)
-                if umobj is None:
-                    raise ValueError('Can not determine filename extension')
-                ext = umobj.group(1)
-
-                info = {
-                    'id': compat_str(data['item_id']),
-                    'url': video_url,
-                    'uploader': data['display_name'],
-                    'upload_date': upload_date,
-                    'title': data['title'],
-                    'ext': ext,
-                    'format': data['media']['mimeType'],
-                    'thumbnail': data['thumbnailUrl'],
-                    'description': data['description'],
-                    'player_url': data['embedUrl'],
-                    'user_agent': 'iTunes/10.6.1',
-                }
-            except (ValueError,KeyError) as err:
-                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
-
-        return [info]
+
+        try:
+            json_code_bytes = urlh.read()
+            json_code = json_code_bytes.decode('utf-8')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
+
+        try:
+            json_data = json.loads(json_code)
+            if 'Post' in json_data:
+                data = json_data['Post']
+            else:
+                data = json_data
+
+            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
+            if 'additionalMedia' in data:
+                formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
+                best_format = formats[-1]
+                video_url = best_format['url']
+            else:
+                video_url = data['media']['url']
+            umobj = re.match(self._URL_EXT, video_url)
+            if umobj is None:
+                raise ValueError('Can not determine filename extension')
+            ext = umobj.group(1)
+
+            return {
+                'id': compat_str(data['item_id']),
+                'url': video_url,
+                'uploader': data['display_name'],
+                'upload_date': upload_date,
+                'title': data['title'],
+                'ext': ext,
+                'format': data['media']['mimeType'],
+                'thumbnail': data['thumbnailUrl'],
+                'description': data['description'],
+                'player_url': data['embedUrl'],
+                'user_agent': 'iTunes/10.6.1',
+            }
+        except (ValueError, KeyError) as err:
+            raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
  
  
  class BlipTVUserIE(InfoExtractor):
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index b1b7526ca98f03cd44707a0f966567a1a0d363b8..f7f0041c0872f84349d2ee060ef8ada9aed9d6bd 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -26,7 +26,7 @@ class BrightcoveIE(InfoExtractor):
              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
              u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
              u'file': u'2371591881001.mp4',
-            u'md5': u'8eccab865181d29ec2958f32a6a754f5',
+            u'md5': u'5423e113865d26e40624dce2e4b45d95',
              u'note': u'Test Brightcove downloads and detection in GenericIE',
              u'info_dict': {
                  u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py

new file mode 100644 (file)

index 0000000..ac03158
--- /dev/null
+++ b/youtube_dl/extractor/cbs.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+
+
+class CBSIE(InfoExtractor):
+    """Extractor for cbs.com show video pages.
+
+    The page itself is only used to look up the real video id (the "pid");
+    the result is a URL reference of the form ``theplatform:<pid>``.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*'
+
+    _TEST = {
+        u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+        u'file': u'4JUVEwq3wUT7.flv',
+        u'info_dict': {
+            u'title': u'Connect Chat feat. Garth Brooks',
+            u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
+            u'duration': 1495,
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a url result pointing at ``theplatform:<pid>`` for *url*."""
+        # The id embedded in the URL is only used to label the page download.
+        page_id = re.match(self._VALID_URL, url).group('id')
+        html = self._download_webpage(url, page_id)
+
+        # The real id is assigned to video.settings.pid in the page's script.
+        pid_pattern = r"video\.settings\.pid\s*=\s*'([^']+)';"
+        platform_id = self._search_regex(pid_pattern, html, u'real video ID')
+
+        return self.url_result(u'theplatform:%s' % platform_id)
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py

new file mode 100644 (file)

index 0000000..ae70ea2
--- /dev/null
+++ b/youtube_dl/extractor/channel9.py
@@ -0,0 +1,267 @@
+# encoding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+class Channel9IE(InfoExtractor):
+    '''
+    Common extractor for channel9.msdn.com.
+
+    The type of provided URL (video or playlist) is determined according to
+    meta Search.PageType from web page HTML rather than URL itself, as it is
+    not always possible to tell the type from the URL alone.
+    '''
+    IE_DESC = u'Channel 9'
+    IE_NAME = u'channel9'
+    # The content path is matched non-greedily and the pattern is anchored at
+    # the end, so that an optional trailing slash is NOT captured as part of
+    # the path (a greedy '.+' would swallow it, and the slash would then leak
+    # into the video id and into the RSS URL built from the path).
+    _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)/?$'
+
+    _TESTS = [
+        {
+            u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+            u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
+            u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
+            u'info_dict': {
+                u'title': u'Developer Kick-Off Session: Stuff We Love',
+                u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
+                u'duration': 4576,
+                u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+                u'session_code': u'KOS002',
+                u'session_day': u'Day 1',
+                u'session_room': u'Arena 1A',
+                u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
+            },
+        },
+        {
+            u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+            u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
+            u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
+            u'info_dict': {
+                u'title': u'Self-service BI with Power BI - nuclear testing',
+                u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+                u'duration': 1540,
+                u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+                u'authors': [ u'Mike Wilmot' ],
+            },
+        }
+    ]
+
+    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
+
+    # Sorted by quality
+    _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
+
+    def _restore_bytes(self, formatted_size):
+        """Convert a formatted size such as u'12.5 MB' into a number of bytes.
+
+        Returns 0 when the value is empty, does not look like
+        "<number> <units>", or uses an unknown unit.
+        """
+        if not formatted_size:
+            return 0
+        m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
+        if not m:
+            return 0
+        units = m.group('units')
+        try:
+            # The position in this list is the power of 1024 to multiply by
+            exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
+        except ValueError:
+            return 0
+        size = float(m.group('size'))
+        return int(size * (1024 ** exponent))
+
+    def _formats_from_html(self, html):
+        """Return the format dicts found in html, ordered by _known_formats.
+
+        Links whose text is not listed in _known_formats are ignored.
+        """
+        # The inline (?x) flag has to be the very first thing in the pattern:
+        # flags elsewhere are deprecated since Python 3.6 and rejected by 3.11+.
+        FORMAT_REGEX = r'''(?x)
+            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
+            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
+            (?:<div\s+class="popup\s+rounded">\s*
+            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
+            </div>)?                                                # File size part may be missing
+        '''
+        # Extract known formats
+        formats = []
+        for m in re.finditer(FORMAT_REGEX, html):
+            quality = m.group('quality')
+            if quality not in self._known_formats:
+                continue
+            note = m.group('note')
+            formats.append({
+                'url': m.group('url'),
+                'format_id': quality,
+                'format_note': note,
+                'format': '%s (%s)' % (quality, note),
+                'filesize': self._restore_bytes(m.group('filesize')),  # File size is approximate
+            })
+        # Sort according to known formats list
+        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+        return formats
+
+    def _extract_title(self, html):
+        """Return the title from the 'title' meta tag, else from OpenGraph.
+
+        The u' (Channel 9)' suffix is stripped from the OpenGraph title.
+        """
+        title = self._html_search_meta(u'title', html, u'title')
+        if title is None:
+            title = self._og_search_title(html)
+            TITLE_SUFFIX = u' (Channel 9)'
+            if title is not None and title.endswith(TITLE_SUFFIX):
+                title = title[:-len(TITLE_SUFFIX)]
+        return title
+
+    def _extract_description(self, html):
+        """Return the entry body markup, else the 'description' meta tag."""
+        DESCRIPTION_REGEX = r'''(?sx)
+            <div\s+class="entry-content">\s*
+            <div\s+id="entry-body">\s*
+            (?P<description>.+?)\s*
+            </div>\s*
+            </div>
+        '''
+        m = re.search(DESCRIPTION_REGEX, html)
+        if m is not None:
+            return m.group('description')
+        return self._html_search_meta(u'description', html, u'description')
+
+    def _extract_duration(self, html):
+        """Return the duration in seconds from data-video_duration, or None."""
+        m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+        if m is None:
+            return None
+        return (int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))
+
+    def _extract_slides(self, html):
+        """Return the URL of the "Slides" link, or None."""
+        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
+        return m.group('slidesurl') if m is not None else None
+
+    def _extract_zip(self, html):
+        """Return the URL of the "Zip" link, or None."""
+        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
+        return m.group('zipurl') if m is not None else None
+
+    def _extract_avg_rating(self, html):
+        """Return the average rating as a float, or 0 when it is not shown."""
+        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
+        return float(m.group('avgrating')) if m is not None else 0
+
+    def _extract_rating_count(self, html):
+        """Return the number of ratings, or 0 when it is not shown."""
+        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
+        return self._fix_count(m.group('ratingcount')) if m is not None else 0
+
+    def _extract_view_count(self, html):
+        """Return the number of views, or 0 when it is not shown."""
+        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
+        return self._fix_count(m.group('viewcount')) if m is not None else 0
+
+    def _extract_comment_count(self, html):
+        """Return the number of comments, or 0 when it is not shown."""
+        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
+        return self._fix_count(m.group('commentcount')) if m is not None else 0
+
+    def _fix_count(self, count):
+        """Turn a count such as '1,234' into an int; None is passed through."""
+        return int(str(count).replace(',', '')) if count is not None else None
+
+    def _extract_authors(self, html):
+        """Return the names linked inside the "author" list item, or None."""
+        m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
+        if m is None:
+            return None
+        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
+
+    def _extract_session_code(self, html):
+        """Return the text of the "code" list item, or None."""
+        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
+        return m.group('code') if m is not None else None
+
+    def _extract_session_day(self, html):
+        """Return the link text of the "day" list item, or None."""
+        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
+        return m.group('day') if m is not None else None
+
+    def _extract_session_room(self, html):
+        """Return the text of the "room" list item, or None."""
+        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
+        return m.group('room') if m is not None else None
+
+    def _extract_session_speakers(self, html):
+        """Return the link texts of all /Events/Speakers/ links (may be empty)."""
+        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
+
+    def _extract_content(self, html, content_path):
+        """Build the list of info dicts for the downloadable items of a page.
+
+        Slides, the zip and the recording each get their own entry; all of
+        them share the same id and metadata.  Returns None (after emitting a
+        warning) when the page offers none of the three.
+        """
+        # Look for downloadable content
+        formats = self._formats_from_html(html)
+        slides = self._extract_slides(html)
+        zip_ = self._extract_zip(html)
+
+        # Nothing to download
+        if not formats and slides is None and zip_ is None:
+            self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
+            return None
+
+        # Extract meta
+        title = self._extract_title(html)
+        common = {
+            '_type': 'video',
+            'id': content_path,
+            'description': self._extract_description(html),
+            'thumbnail': self._og_search_thumbnail(html),
+            'duration': self._extract_duration(html),
+            'avg_rating': self._extract_avg_rating(html),
+            'rating_count': self._extract_rating_count(html),
+            'view_count': self._extract_view_count(html),
+            'comment_count': self._extract_comment_count(html),
+        }
+
+        result = []
+
+        if slides is not None:
+            d = common.copy()
+            d.update({'title': title + '-Slides', 'url': slides})
+            result.append(d)
+
+        if zip_ is not None:
+            d = common.copy()
+            d.update({'title': title + '-Zip', 'url': zip_})
+            result.append(d)
+
+        if formats:
+            d = common.copy()
+            d.update({'title': title, 'formats': formats})
+            result.append(d)
+
+        return result
+
+    def _extract_entry_item(self, html, content_path):
+        """Extract an 'Entry.Item' page: the page contents plus 'authors'."""
+        contents = self._extract_content(html, content_path)
+        if contents is None:
+            return contents
+
+        authors = self._extract_authors(html)
+
+        for content in contents:
+            content['authors'] = authors
+
+        return contents
+
+    def _extract_session(self, html, content_path):
+        """Extract a 'Session' page: the page contents plus session_* fields."""
+        contents = self._extract_content(html, content_path)
+        if contents is None:
+            return contents
+
+        session_meta = {
+            'session_code': self._extract_session_code(html),
+            'session_day': self._extract_session_day(html),
+            'session_room': self._extract_session_room(html),
+            'session_speakers': self._extract_session_speakers(html),
+        }
+
+        for content in contents:
+            content.update(session_meta)
+
+        return contents
+
+    def _extract_list(self, content_path):
+        """Return a playlist made of the item links of the path's RSS feed."""
+        rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
+        entries = [self.url_result(session_url.text, 'Channel9')
+                   for session_url in rss.findall('./channel/item/link')]
+        # A feed without a <title> should not abort the whole extraction
+        title_el = rss.find('./channel/title')
+        title_text = title_el.text if title_el is not None else None
+        return self.playlist_result(entries, content_path, title_text)
+
+    def _real_extract(self, url):
+        """Dispatch on the page's Search.PageType meta tag.
+
+        Raises ExtractorError (expected) when the tag is missing or carries
+        an unknown value.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        content_path = mobj.group('contentpath')
+
+        webpage = self._download_webpage(url, content_path, u'Downloading web page')
+
+        page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
+        if page_type_m is None:
+            raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
+
+        page_type = page_type_m.group('pagetype')
+        if page_type == 'List':         # List page, may contain list of 'item'-like objects
+            return self._extract_list(content_path)
+        elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
+            return self._extract_entry_item(webpage, content_path)
+        elif page_type == 'Session':    # Event session page, may contain downloadable content
+            return self._extract_session(webpage, content_path)
+        else:
+            raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True)
+\ No newline at end of file
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 69a083b68aa3cee7d8ec2a6af9af7108a28bdf9b..ba46a7bc77d17ed4bcf4dcf7764b1d39f4799958 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -18,6 +18,7 @@ from ..utils import (
      sanitize_filename,
      unescapeHTML,
  )
+_NO_DEFAULT = object()
  
  
  class InfoExtractor(object):
@@ -34,15 +35,39 @@ class InfoExtractor(object):
      The dictionaries must include the following fields:
  
      id:             Video identifier.
-    url:            Final video URL.
      title:          Video title, unescaped.
-    ext:            Video filename extension.
  
-    Instead of url and ext, formats can also specified.
+    Additionally, it must contain either a formats entry or url and ext:
+
+    formats:        A list of dictionaries for each format available, it must
+                    be ordered from worst to best quality. Potential fields:
+                    * url        Mandatory. The URL of the video file
+                    * ext        Will be calculated from url if missing
+                    * format     A human-readable description of the format
+                                 ("mp4 container with h264/opus").
+                                 Calculated from the format_id, width, height,
+                                 and format_note fields if missing.
+                    * format_id  A short description of the format
+                                 ("mp4_h264_opus" or "19")
+                    * format_note Additional info about the format
+                                 ("3D" or "DASH video")
+                    * width      Width of the video, if known
+                    * height     Height of the video, if known
+                    * abr        Average audio bitrate in KBit/s
+                    * acodec     Name of the audio codec in use
+                    * vbr        Average video bitrate in KBit/s
+                    * vcodec     Name of the video codec in use
+                    * filesize   The number of bytes, if known in advance
+                    * player_url SWF Player URL (used for rtmpdump).
+    url:            Final video URL.
+    ext:            Video filename extension.
+    format:         The video format, defaults to ext (used for --get-format)
+    player_url:     SWF Player URL (used for rtmpdump).
+    urlhandle:      [internal] The urlHandle to be used to download the file,
+                    like returned by urllib.request.urlopen
  
      The following fields are optional:
  
-    format:         The video format, defaults to ext (used for --get-format)
      thumbnails:     A list of dictionaries (with the entries "resolution" and
                      "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
@@ -51,35 +76,14 @@ class InfoExtractor(object):
      upload_date:    Video upload date (YYYYMMDD).
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
-    player_url:     SWF Player URL (used for rtmpdump).
      subtitles:      The subtitle file contents as a dictionary in the format
                      {language: subtitles}.
+    duration:       Length of the video in seconds, as an integer.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
      comment_count:  Number of comments on the video
-    urlhandle:      [internal] The urlHandle to be used to download the file,
-                    like returned by urllib.request.urlopen
      age_limit:      Age restriction for the video, as an integer (years)
-    formats:        A list of dictionaries for each format available, it must
-                    be ordered from worst to best quality. Potential fields:
-                    * url       Mandatory. The URL of the video file
-                    * ext       Will be calculated from url if missing
-                    * format    A human-readable description of the format
-                                ("mp4 container with h264/opus").
-                                Calculated from the format_id, width, height.
-                                and format_note fields if missing.
-                    * format_id A short description of the format
-                                ("mp4_h264_opus" or "19")
-                    * format_note Additional info about the format
-                                ("3D" or "DASH video")
-                    * width     Width of the video, if known
-                    * height    Height of the video, if known
-                    * abr       Average audio bitrate in KBit/s
-                    * acodec    Name of the audio codec in use
-                    * vbr       Average video bitrate in KBit/s
-                    * vcodec    Name of the video codec in use
-                    * filesize  The number of bytes, if known in advance
      webpage_url:    The url to the video webpage, if given to youtube-dl it
                      should allow to get the same result again. (It will be set
                      by YoutubeDL if it's missing)
@@ -166,6 +170,8 @@ class InfoExtractor(object):
          try:
              return self._downloader.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            if errnote is False:
+                return False
              if errnote is None:
                  errnote = u'Unable to download webpage'
              errmsg = u'%s: %s' % (errnote, compat_str(err))
@@ -259,7 +265,8 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    def url_result(self, url, ie=None, video_id=None):
+    @staticmethod
+    def url_result(url, ie=None, video_id=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
@@ -268,7 +275,8 @@ class InfoExtractor(object):
          if video_id is not None:
              video_info['id'] = video_id
          return video_info
-    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+    @staticmethod
+    def playlist_result(entries, playlist_id=None, playlist_title=None):
          """Returns a playlist"""
          video_info = {'_type': 'playlist',
                        'entries': entries}
@@ -278,7 +286,7 @@ class InfoExtractor(object):
              video_info['title'] = playlist_title
          return video_info
  
-    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
          """
          Perform a regex search on the given string, using a single or a list of
          patterns returning the first matching group.
@@ -292,7 +300,7 @@ class InfoExtractor(object):
                  mobj = re.search(p, string, flags)
                  if mobj: break
  
-        if sys.stderr.isatty() and os.name != 'nt':
+        if os.name != 'nt' and sys.stderr.isatty():
              _name = u'\033[0;34m%s\033[0m' % name
          else:
              _name = name
@@ -300,7 +308,7 @@ class InfoExtractor(object):
          if mobj:
              # return the first matching group
              return next(g for g in mobj.groups() if g is not None)
-        elif default is not None:
+        elif default is not _NO_DEFAULT:
              return default
          elif fatal:
              raise RegexNotFoundError(u'Unable to extract %s' % _name)
@@ -309,7 +317,7 @@ class InfoExtractor(object):
                  u'please report this issue on http://yt-dl.org/bug' % _name)
              return None
  
-    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
          """
          Like _search_regex, but strips HTML tags and unescapes entities.
          """
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py

new file mode 100644 (file)

index 0000000..2b66bdd
--- /dev/null
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -0,0 +1,171 @@
+# encoding: utf-8
+import re, base64, zlib
+from hashlib import sha1
+from math import pow, sqrt, floor
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    compat_urllib_parse,
+    compat_urllib_request,
+    bytes_to_intlist,
+    intlist_to_bytes,
+    unified_strdate,
+    clean_html,
+)
+from ..aes import (
+    aes_cbc_decrypt,
+    inc,
+)
+
+class CrunchyrollIE(InfoExtractor):
+    """Extractor for crunchyroll.com episode pages, including subtitles."""
+
+    _VALID_URL = r'(?:https?://)?(?:www\.)?(?P<url>crunchyroll\.com/[^/]*/[^/?&]*?(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _TESTS = [{
+        u'url': u'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
+        u'file': u'645513.flv',
+        #u'md5': u'b1639fd6ddfaa43788c85f6d1dddd412',
+        u'info_dict': {
+            u'title': u'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
+            u'description': u'md5:2d17137920c64f2f49981a7797d275ef',
+            u'thumbnail': u'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
+            u'uploader': u'Yomiuri Telecasting Corporation (YTV)',
+            u'upload_date': u'20131013',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }]
+
+    # Quality label found in the page -> (video_encode_quality, video_format)
+    # request parameters
+    _FORMAT_IDS = {
+        u'360': (u'60', u'106'),
+        u'480': (u'61', u'106'),
+        u'720': (u'62', u'106'),
+        u'1080': (u'80', u'108'),
+    }
+
+    def _decrypt_subtitles(self, data, iv, id):
+        """Decrypt and decompress one subtitle script.
+
+        data and iv are the raw (already base64-decoded) bytes; id is the
+        numeric subtitle id the key is derived from.  The payload is
+        decrypted with AES-CBC and then zlib-decompressed; the decompressed
+        bytes are returned.
+        """
+        data = bytes_to_intlist(data)
+        iv = bytes_to_intlist(iv)
+        id = int(id)
+
+        def obfuscate_key_aux(count, modulo, start):
+            # Additive sequence seeded with `start`, reduced modulo `modulo`
+            # and shifted by 33
+            output = list(start)
+            for _ in range(count):
+                output.append(output[-1] + output[-2])
+            # cut off start values
+            output = output[2:]
+            output = list(map(lambda x: x % modulo + 33, output))
+            return output
+
+        def obfuscate_key(key):
+            num1 = int(floor(pow(2, 25) * sqrt(6.9)))
+            num2 = (num1 ^ key) << 5
+            num3 = key ^ num1
+            num4 = num3 ^ (num3 >> 3) ^ num2
+            prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
+            shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode(u'ascii')).digest())
+            # Extend 160 Bit hash to 256 Bit
+            return shaHash + [0] * 12
+
+        key = obfuscate_key(id)
+        decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
+        return zlib.decompress(decrypted_data)
+
+    def _convert_subtitles_to_srt(self, subtitles):
+        """Convert the <event .../> entries of a subtitle script into SRT text.
+
+        Events whose text is empty after HTML cleaning are dropped and do
+        not consume a sequence number.
+        """
+        entries = []
+        for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
+            text = clean_html(text)
+            text = text.replace(u'\\N', u'\n')
+            if not text:
+                continue
+            # Timestamps get ',' instead of '.' as the separator
+            start = start.replace(u'.', u',')
+            end = end.replace(u'.', u',')
+            entries.append(u'%d\n%s --> %s\n%s\n\n' % (len(entries) + 1, start, end, text))
+        return u''.join(entries)
+
+    def _real_extract(self, url):
+        """Extract metadata, one format per advertised quality and subtitles.
+
+        Raises ExtractorError with the page's own message when the page
+        carries a trailer notice.
+        """
+        mobj = re.match(self._VALID_URL, url)
+
+        webpage_url = u'http://www.' + mobj.group('url')
+        video_id = mobj.group(u'video_id')
+        webpage = self._download_webpage(webpage_url, video_id)
+        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, u'trailer-notice', default=u'')
+        if note_m:
+            raise ExtractorError(note_m)
+
+        video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, u'video_title', flags=re.DOTALL)
+        video_title = re.sub(r' {2,}', u' ', video_title)
+        video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, u'video_description', default=u'')
+        if not video_description:
+            video_description = None
+        video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, u'video_upload_date', fatal=False, flags=re.DOTALL)
+        if video_upload_date:
+            video_upload_date = unified_strdate(video_upload_date)
+        video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, u'video_uploader', fatal=False, flags=re.DOTALL)
+
+        playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, u'playerdata_url'))
+        playerdata_req = compat_urllib_request.Request(playerdata_url)
+        playerdata_req.data = compat_urllib_parse.urlencode({u'current_page': webpage_url})
+        playerdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded')
+        playerdata = self._download_webpage(playerdata_req, video_id, note=u'Downloading media info')
+
+        stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, u'stream_id')
+        video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, u'thumbnail', fatal=False)
+
+        formats = []
+        for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
+            video_format = fmt+u'p'
+            format_ids = self._FORMAT_IDS.get(fmt)
+            if format_ids is None:
+                # A quality missing from the table must not abort the whole
+                # extraction with a KeyError
+                self._downloader.report_warning(u'Unknown format %s, skipping' % video_format)
+                continue
+            stream_quality, stream_format = format_ids
+            streamdata_req = compat_urllib_request.Request(u'http://www.crunchyroll.com/xml/')
+            # urlencode doesn't work!
+            streamdata_req.data = u'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+u'&media%5Fid='+stream_id+u'&video%5Fformat='+stream_format
+            streamdata_req.add_header(u'Content-Type', u'application/x-www-form-urlencoded')
+            streamdata_req.add_header(u'Content-Length', str(len(streamdata_req.data)))
+            streamdata = self._download_webpage(streamdata_req, video_id, note=u'Downloading media info for '+video_format)
+            video_url = self._search_regex(r'<host>([^<]+)', streamdata, u'video_url')
+            video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, u'video_play_path')
+            formats.append({
+                u'url': video_url,
+                u'play_path':   video_play_path,
+                u'ext': 'flv',
+                u'format': video_format,
+                u'format_id': video_format,
+            })
+
+        subtitles = {}
+        for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
+            sub_page = self._download_webpage(
+                u'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,
+                video_id, note=u'Downloading subtitles for '+sub_name)
+            subtitle_id = self._search_regex(r'id=\'([0-9]+)', sub_page, u'subtitle_id', fatal=False)
+            iv = self._search_regex(r'<iv>([^<]+)', sub_page, u'subtitle_iv', fatal=False)
+            data = self._search_regex(r'<data>([^<]+)', sub_page, u'subtitle_data', fatal=False)
+            # Skip subtitle tracks with incomplete data instead of failing
+            if not subtitle_id or not iv or not data:
+                continue
+            subtitle_id = int(subtitle_id)
+            iv = base64.b64decode(iv)
+            data = base64.b64decode(data)
+
+            subtitle = self._decrypt_subtitles(data, iv, subtitle_id).decode(u'utf-8')
+            lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, u'subtitle_lang_code', fatal=False)
+            if not lang_code:
+                continue
+            subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
+
+        return {
+            u'id':          video_id,
+            u'title':       video_title,
+            u'description': video_description,
+            u'thumbnail':   video_thumbnail,
+            u'uploader':    video_uploader,
+            u'upload_date': video_upload_date,
+            u'subtitles':   subtitles,
+            u'formats':     formats,
+        }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index aea7e557e85457b0526812afb37c46f249e35826..6685c94a3d6b283e0b7f2240ebfcf35ce462edc2 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
  class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
      """Information Extractor for Dailymotion"""
  
-    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
+    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
      IE_NAME = u'dailymotion'
  
      _FORMATS = [
@@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
          # Extract id and simplified title from URL
          mobj = re.match(self._VALID_URL, url)
  
-        video_id = mobj.group(1).split('_')[0].split('?')[0]
+        video_id = mobj.group('id')
  
          url = 'http://www.dailymotion.com/video/%s' % video_id
  
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py

index d418ce4a8a29c122e811c96aac76d388c790b560..4876ecb4812710e2509eec8fc19f00dac60d2fde 100644 (file)
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -9,7 +9,7 @@ from ..utils import (
  
  
  class DaumIE(InfoExtractor):
-    _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
      IE_NAME = u'daum.net'
  
      _TEST = {
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py

index 3b210710e3695ec3aa940b335d9868a281d7740a..4556079c8ad5edce7a6a3efe29989299d719ed28 100644 (file)
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -17,7 +17,7 @@ from ..utils import (
  class FacebookIE(InfoExtractor):
      """Information Extractor for Facebook"""
  
-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
      _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
      _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
      _NETRC_MACHINE = 'facebook'
@@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor):
          u'file': u'120708114770723.mp4',
          u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
          u'info_dict': {
-            u"duration": 279, 
+            u"duration": 279,
              u"title": u"PEOPLE ARE AWESOME 2013"
          }
      }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 216e032186297b7b91a488fb1edd1421e3270b39..7a14c98f9b6ef9d550606c72c330d0730ec1233e 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -11,10 +11,14 @@ from ..utils import (
      compat_urlparse,
  
      ExtractorError,
+    HEADRequest,
      smuggle_url,
      unescapeHTML,
+    unified_strdate,
+    url_basename,
  )
  from .brightcove import BrightcoveIE
+from .ooyala import OoyalaIE
  
  
  class GenericIE(InfoExtractor):
@@ -71,6 +75,27 @@ class GenericIE(InfoExtractor):
                  u'skip_download': True,
              },
          },
+        # Direct link to a video
+        {
+            u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4',
+            u'file': u'trailer.mp4',
+            u'md5': u'67d406c2bcb6af27fa886f31aa934bbe',
+            u'info_dict': {
+                u'id': u'trailer',
+                u'title': u'trailer',
+                u'upload_date': u'20100513',
+            }
+        },
+        # ooyala video
+        {
+            u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+            u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c',
+            u'info_dict': {
+                u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
+                u'ext': u'mp4',
+                u'title': u'2cc213299525360.mov', #that's what we get
+            },
+        },
      ]
  
      def report_download_webpage(self, video_id):
@@ -83,23 +108,20 @@ class GenericIE(InfoExtractor):
          """Report information extraction."""
          self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
  
-    def _test_redirect(self, url):
+    def _send_head(self, url):
          """Check if it is a redirect, like url shorteners, in case return the new url."""
-        class HeadRequest(compat_urllib_request.Request):
-            def get_method(self):
-                return "HEAD"
  
          class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
              """
              Subclass the HTTPRedirectHandler to make it use our
-            HeadRequest also on the redirected URL
+            HEADRequest also on the redirected URL
              """
              def redirect_request(self, req, fp, code, msg, headers, newurl):
                  if code in (301, 302, 303, 307):
                      newurl = newurl.replace(' ', '%20')
                      newheaders = dict((k,v) for k,v in req.headers.items()
                                        if k.lower() not in ("content-length", "content-type"))
-                    return HeadRequest(newurl,
+                    return HEADRequest(newurl,
                                         headers=newheaders,
                                         origin_req_host=req.get_origin_req_host(),
                                         unverifiable=True)
@@ -128,32 +150,49 @@ class GenericIE(InfoExtractor):
                          compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
              opener.add_handler(handler())
  
-        response = opener.open(HeadRequest(url))
+        response = opener.open(HEADRequest(url))
          if response is None:
              raise ExtractorError(u'Invalid URL protocol')
-        new_url = response.geturl()
-
-        if url == new_url:
-            return False
-
-        self.report_following_redirect(new_url)
-        return new_url
+        return response
  
      def _real_extract(self, url):
          parsed_url = compat_urlparse.urlparse(url)
          if not parsed_url.scheme:
              self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
              return self.url_result('http://' + url)
+        video_id = os.path.splitext(url.split('/')[-1])[0]
  
          try:
-            new_url = self._test_redirect(url)
-            if new_url:
-                return [self.url_result(new_url)]
+            response = self._send_head(url)
+
+            # Check for redirect
+            new_url = response.geturl()
+            if url != new_url:
+                self.report_following_redirect(new_url)
+                return self.url_result(new_url)
+
+            # Check for direct link to a video
+            content_type = response.headers.get('Content-Type', '')
+            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+            if m:
+                upload_date = response.headers.get('Last-Modified')
+                if upload_date:
+                    upload_date = unified_strdate(upload_date)
+                return {
+                    'id': video_id,
+                    'title': os.path.splitext(url_basename(url))[0],
+                    'formats': [{
+                        'format_id': m.group('format_id'),
+                        'url': url,
+                        'vcodec': u'none' if m.group('type') == 'audio' else None
+                    }],
+                    'upload_date': upload_date,
+                }
+
          except compat_urllib_error.HTTPError:
              # This may be a stupid server that doesn't like HEAD, our UA, or so
              pass
  
-        video_id = url.split('/')[-1]
          try:
              webpage = self._download_webpage(url, video_id)
          except ValueError:
@@ -183,7 +222,7 @@ class GenericIE(InfoExtractor):
              self.to_screen(u'Brightcove video detected.')
              return self.url_result(bc_url, 'Brightcove')
  
-        # Look for embedded Vimeo player
+        # Look for embedded (iframe) Vimeo player
          mobj = re.search(
              r'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage)
          if mobj:
@@ -191,9 +230,18 @@ class GenericIE(InfoExtractor):
              surl = smuggle_url(player_url, {'Referer': url})
              return self.url_result(surl, 'Vimeo')
  
+        # Look for embedded (swf embed) Vimeo player
+        mobj = re.search(
+            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage)
+        if mobj:
+            return self.url_result(mobj.group(1), 'Vimeo')
+
          # Look for embedded YouTube player
-        matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
+        matches = re.findall(r'''(?x)
+            (?:<iframe[^>]+?src=|embedSWF\(\s*)
+            (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
+                (?:embed|v)/.+?)
+            \1''', webpage)
          if matches:
              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                       for tuppl in matches]
@@ -222,6 +270,18 @@ class GenericIE(InfoExtractor):
                  'id': video_id,
              }
  
+        # Look for embedded blip.tv player
+        mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage)
+        if mobj:
+            return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV')
+        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage)
+        if mobj:
+            player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1)
+            player_page = self._download_webpage(player_url, mobj.group(1))
+            blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False)
+            if blip_video_id:
+                return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV')
+
          # Look for Bandcamp pages with custom domain
          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
          if mobj is not None:
@@ -229,6 +289,22 @@ class GenericIE(InfoExtractor):
              # Don't set the extractor because it can be a track url or an album
              return self.url_result(burl)
  
+        # Look for embedded Vevo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for Ooyala videos
+        mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+        if mobj is not None:
+            return OoyalaIE._build_url_result(mobj.group(1))
+
+        # Look for Aparat videos
+        mobj = re.search(r'<iframe src="(http://www.aparat.com/video/[^"]+)"', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group(1), 'Aparat')
+
          # Start with something easy: JW Player in SWFObject
          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if mobj is None:
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py

index 57b79a3363484d2e1a663112d63c219e2895781f..381af91e42d4c9f642b35643107f5dafd026aad9 100644 (file)
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -44,7 +44,7 @@ class IGNIE(InfoExtractor):
                  {
                      u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
                      u'info_dict': {
-                        u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion',
+                        u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
                          u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
                      },
                  },
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py

index 6fb373db2ae1a1385efc267284f3bbbc5383836c..e5332cce820ca239c915da402107a77143f0484b 100644 (file)
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -11,7 +11,7 @@ from ..utils import (
  class ImdbIE(InfoExtractor):
      IE_NAME = u'imdb'
      IE_DESC = u'Internet Movie Database trailers'
-    _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+    _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
  
      _TEST = {
          u'url': u'http://www.imdb.com/video/imdb/vi2524815897',
@@ -27,7 +27,7 @@ class ImdbIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
-        webpage = self._download_webpage(url,video_id)
+        webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
          descr = get_element_by_attribute('itemprop', 'description', webpage)
          available_formats = re.findall(
              r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py

new file mode 100644 (file)

index 0000000..4bdf55f
--- /dev/null
+++ b/youtube_dl/extractor/ivi.py
@@ -0,0 +1,154 @@
+# encoding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    ExtractorError,
+)
+
+
+class IviIE(InfoExtractor):
+    IE_DESC = u'ivi.ru'
+    IE_NAME = u'ivi'
+    _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch(?:/(?P<compilationid>[^/]+))?/(?P<videoid>\d+)'
+
+    _TESTS = [
+        # Single movie
+        {
+            u'url': u'http://www.ivi.ru/watch/53141',
+            u'file': u'53141.mp4',
+            u'md5': u'6ff5be2254e796ed346251d117196cf4',
+            u'info_dict': {
+                u'title': u'Иван Васильевич меняет профессию',
+                u'description': u'md5:14d8eda24e9d93d29b5857012c6d6346',
+                u'duration': 5498,
+                u'thumbnail': u'http://thumbs.ivi.ru/f20.vcp.digitalaccess.ru/contents/d/1/c3c885163a082c29bceeb7b5a267a6.jpg',
+            },
+            u'skip': u'Only works from Russia',
+        },
+        # Single episode of a series
+        {
+            u'url': u'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
+            u'file': u'74791.mp4',
+            u'md5': u'3e6cc9a848c1d2ebcc6476444967baa9',
+            u'info_dict': {
+                u'title': u'Дежурный ангел - 1 серия',
+                u'duration': 2490,
+                u'thumbnail': u'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
+            },
+            u'skip': u'Only works from Russia',
+         }
+    ]
+    
+    # Sorted by quality, lowest first
+    _known_formats = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ']
+
+    # Sorted by size, smallest first
+    _known_thumbnails = ['Thumb-120x90', 'Thumb-160', 'Thumb-640x480']
+
+    def _extract_description(self, html):
+        m = re.search(r'<meta name="description" content="(?P<description>[^"]+)"/>', html)
+        return m.group('description') if m is not None else None
+
+    def _extract_comment_count(self, html):
+        m = re.search(u'(?s)<a href="#" id="view-comments" class="action-button dim gradient">\s*Комментарии:\s*(?P<commentcount>\d+)\s*</a>', html)
+        return int(m.group('commentcount')) if m is not None else 0
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+
+        api_url = 'http://api.digitalaccess.ru/api/json/'
+
+        data = {u'method': u'da.content.get',
+                u'params': [video_id, {u'site': u's183',
+                                       u'referrer': u'http://www.ivi.ru/watch/%s' % video_id,
+                                       u'contentid': video_id
+                                    }
+                            ]
+                }
+
+        request = compat_urllib_request.Request(api_url, json.dumps(data))
+
+        video_json_page = self._download_webpage(request, video_id, u'Downloading video JSON')
+        video_json = json.loads(video_json_page)
+
+        if u'error' in video_json:
+            error = video_json[u'error']
+            if error[u'origin'] == u'NoRedisValidData':
+                raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+            raise ExtractorError(u'Unable to download video %s: %s' % (video_id, error[u'message']), expected=True)
+
+        result = video_json[u'result']
+
+        formats = [{'url': x[u'url'],
+                    'format_id': x[u'content_format']
+                    } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
+        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+
+        if len(formats) == 0:
+            self._downloader.report_warning(u'No media links available for %s' % video_id)
+            return
+
+        duration = result[u'duration']
+        compilation = result[u'compilation']
+        title = result[u'title']
+
+        title = '%s - %s' % (compilation, title) if compilation is not None else title  
+
+        previews = result[u'preview']
+        previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format']))
+        thumbnail = previews[-1][u'url'] if len(previews) > 0 else None
+
+        video_page = self._download_webpage(url, video_id, u'Downloading video page')
+        description = self._extract_description(video_page)
+        comment_count = self._extract_comment_count(video_page)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'comment_count': comment_count,
+            'formats': formats,
+        }
+
+
+class IviCompilationIE(InfoExtractor):
+    IE_DESC = u'ivi.ru compilations'
+    IE_NAME = u'ivi:compilation'
+    _VALID_URL = r'^https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
+
+    def _extract_entries(self, html, compilation_id):
+        return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi')
+                for serie in re.findall(r'<strong><a href="/watch/%s/(\d+)">(?:[^<]+)</a></strong>' % compilation_id, html)]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        compilation_id = mobj.group('compilationid')
+        season_id = mobj.group('seasonid')
+
+        if season_id is not None: # Season link
+            season_page = self._download_webpage(url, compilation_id, u'Downloading season %s web page' % season_id)
+            playlist_id = '%s/season%s' % (compilation_id, season_id)
+            playlist_title = self._html_search_meta(u'title', season_page, u'title')
+            entries = self._extract_entries(season_page, compilation_id)
+        else: # Compilation link            
+            compilation_page = self._download_webpage(url, compilation_id, u'Downloading compilation web page')
+            playlist_id = compilation_id
+            playlist_title = self._html_search_meta(u'title', compilation_page, u'title')
+            seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page)
+            if len(seasons) == 0: # No seasons in this compilation
+                entries = self._extract_entries(compilation_page, compilation_id)
+            else:
+                entries = []
+                for season_id in seasons:
+                    season_page = self._download_webpage('http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id),
+                                                         compilation_id, u'Downloading season %s web page' % season_id)
+                    entries.extend(self._extract_entries(season_page, compilation_id))
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
+\ No newline at end of file
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py

new file mode 100644 (file)

index 0000000..08ce064
--- /dev/null
+++ b/youtube_dl/extractor/mdr.py
@@ -0,0 +1,63 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+
+
+class MDRIE(InfoExtractor):
+    _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*'
+    
+    # No tests, MDR regularly deletes its videos
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('video_id')
+        domain = m.group('domain')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title')
+        xmlurl = self._search_regex(
+            r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL')
+
+        doc = self._download_xml(domain + xmlurl, video_id)
+        formats = []
+        for a in doc.findall('./assets/asset'):
+            url_el = a.find('.//progressiveDownloadUrl')
+            if url_el is None:
+                continue
+            abr = int(a.find('bitrateAudio').text) // 1000
+            media_type = a.find('mediaType').text
+            format = {
+                'abr': abr,
+                'filesize': int(a.find('fileSize').text),
+                'url': url_el.text,
+            }
+
+            vbr_el = a.find('bitrateVideo')
+            if vbr_el is None:
+                format.update({
+                    'vcodec': 'none',
+                    'format_id': u'%s-%d' % (media_type, abr),
+                })
+            else:
+                vbr = int(vbr_el.text) // 1000
+                format.update({
+                    'vbr': vbr,
+                    'width': int(a.find('frameWidth').text),
+                    'height': int(a.find('frameHeight').text),
+                    'format_id': u'%s-%d' % (media_type, vbr),
+                })
+            formats.append(format)
+        formats.sort(key=lambda f: (f.get('vbr'), f['abr']))
+        if not formats:
+            raise ExtractorError(u'Could not find any valid formats')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index 5b2bd96334e2eb1a090db2260f08ab9cc1a4b882..ed11f521aa02aa3fe421b8fc743b0a26b1e1cdd0 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -93,7 +93,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
  
  
  class MTVIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+    _VALID_URL = r'''(?x)^https?://
+        (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
+           m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''
  
      _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
  
@@ -127,16 +129,17 @@ class MTVIE(MTVServicesInfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('videoid')
-
-        webpage = self._download_webpage(url, video_id)
-
-        # Some videos come from Vevo.com
-        m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
-                           webpage, re.DOTALL)
-        if m_vevo:
-            vevo_id = m_vevo.group(1);
-            self.to_screen(u'Vevo video detected: %s' % vevo_id)
-            return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
-
-        uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+        uri = mobj.group('mgid')
+        if uri is None:
+            webpage = self._download_webpage(url, video_id)
+    
+            # Some videos come from Vevo.com
+            m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
+                               webpage, re.DOTALL)
+            if m_vevo:
+                vevo_id = m_vevo.group(1);
+                self.to_screen(u'Vevo video detected: %s' % vevo_id)
+                return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+    
+            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
          return self._get_videos_info(uri)
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py

index c012ec0cfacb2afea6b395c5c87509f53ed58614..4cab30631956b903682fc2de7aa5dd551bcdd4a3 100644 (file)
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -9,7 +9,7 @@ from ..utils import (
  
  
  class NaverIE(InfoExtractor):
-    _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
  
      _TEST = {
          u'url': u'http://tvcast.naver.com/v/81652',
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py

index 2e8501f99b32309f7d65561c0d39492d95a917fb..d81df3c10668492383c11b03cf30db7d797f7c90 100644 (file)
--- a/youtube_dl/extractor/ndtv.py
+++ b/youtube_dl/extractor/ndtv.py
@@ -1,6 +1,4 @@
-import json
  import re
-import time
  
  from .common import InfoExtractor
  from ..utils import month_by_name
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py

index 1f7b4d2e7e9fa79ef9f81f71f190f943c35dd3a5..d08e47734c217864a93062acbe87e2e658c57779 100644 (file)
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -22,6 +22,11 @@ class OoyalaIE(InfoExtractor):
      def _url_for_embed_code(embed_code):
          return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
  
+    @classmethod
+    def _build_url_result(cls, embed_code):
+        return cls.url_result(cls._url_for_embed_code(embed_code),
+            ie=cls.ie_key())
+
      def _extract_result(self, info, more_info):
          return {'id': info['embedCode'],
                  'ext': 'mp4',
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py

new file mode 100644 (file)

index 0000000..71abd50
--- /dev/null
+++ b/youtube_dl/extractor/pornhd.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse
+
+
+class PornHdIE(InfoExtractor):
+    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+    _TEST = {
+        u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
+        u'file': u'1962.flv',
+        u'md5': u'35272469887dca97abd30abecc6cdf75',
+        u'info_dict': {
+            u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('video_id')
+        video_title = mobj.group('video_title')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._html_search_regex(
+            r'&hd=(http.+?)&', webpage, u'video URL')
+        video_url = compat_urllib_parse.unquote(video_url)
+        age_limit = 18
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': video_title,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py

new file mode 100644 (file)

index 0000000..34652f6
--- /dev/null
+++ b/youtube_dl/extractor/radiofrance.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+
+
+class RadioFranceIE(InfoExtractor):
+    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
+    IE_NAME = u'radiofrance'
+
+    _TEST = {
+        u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
+        u'file': u'one-one.ogg',
+        u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
+        u'info_dict': {
+            u"title": u"One to one",
+            u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+            u"uploader": u"Thomas Hercouët",
+        },
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title')
+        description = self._html_search_regex(
+            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
+            webpage, u'description', fatal=False)
+        uploader = self._html_search_regex(
+            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
+            webpage, u'uploader', fatal=False)
+
+        formats_str = self._html_search_regex(
+            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
+            webpage, u'audio URLs')
+        formats = [
+            {
+                'format_id': fm[0],
+                'url': fm[1],
+                'vcodec': 'none',
+            }
+            for fm in
+            re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
+        ]
+        # No sorting, we don't know any more about these formats
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+        }
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py

index 511674d8da62d250487efc6ea8afcf7badb19755..ccf0b1546452bbe85837ca1de837f7321a0bec0c 100644 (file)
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -10,7 +10,7 @@ from ..utils import (
  
  class RTLnowIE(InfoExtractor):
      """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
+    _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
      _TESTS = [{
          u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
          u'file': u'90419.flv',
@@ -82,7 +82,7 @@ class RTLnowIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
  
          webpage_url = u'http://' + mobj.group('url')
-        video_page_url = u'http://' + mobj.group('base_url')
+        video_page_url = u'http://' + mobj.group('domain') + u'/'
          video_id = mobj.group(u'video_id')
  
          webpage = self._download_webpage(webpage_url, video_id)
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py

index 4ea89bf85e7c27cd159af8b6a927a92476b70b0d..beea58d6317727133f85b74c14097445cf785dc5 100644 (file)
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -1,5 +1,6 @@
  # encoding: utf-8
  
+import os.path
  import re
  import json
  import hashlib
@@ -10,6 +11,7 @@ from ..utils import (
      compat_urllib_parse,
      compat_urllib_request,
      ExtractorError,
+    url_basename,
  )
  
  
@@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):
          # We will extract some from the video web page instead
          video_page_url = 'http://' + mobj.group('url')
          video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
-        
+
+        # Warning if video is unavailable
+        warning = self._html_search_regex(
+            r'<div class="videoUnModer">(.*?)</div>', video_page,
+            u'warning messagef', default=None)
+        if warning is not None:
+            self._downloader.report_warning(
+                u'Video %s may not be available; smotri said: %s ' %
+                (video_id, warning))
+
          # Adult content
          if re.search(u'EroConfirmText">', video_page) is not None:
              self.report_age_confirmation()
@@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):
          # Extract the rest of meta data
          video_title = self._search_meta(u'name', video_page, u'title')
          if not video_title:
-            video_title = video_url.rsplit('/', 1)[-1]
+            video_title = os.path.splitext(url_basename(video_url))[0]
  
          video_description = self._search_meta(u'description', video_page)
          END_TEXT = u' на сайте Smotri.com'
-        if video_description.endswith(END_TEXT):
+        if video_description and video_description.endswith(END_TEXT):
              video_description = video_description[:-len(END_TEXT)]
          START_TEXT = u'Смотреть онлайн ролик '
-        if video_description.startswith(START_TEXT):
+        if video_description and video_description.startswith(START_TEXT):
              video_description = video_description[len(START_TEXT):]
          video_thumbnail = self._search_meta(u'thumbnail', video_page)
  
          upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
-        upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
-        video_upload_date = (
-            (
-                upload_date_m.group('year') +
-                upload_date_m.group('month') +
-                upload_date_m.group('day')
+        if upload_date_str:
+            upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+            video_upload_date = (
+                (
+                    upload_date_m.group('year') +
+                    upload_date_m.group('month') +
+                    upload_date_m.group('day')
+                )
+                if upload_date_m else None
              )
-            if upload_date_m else None
-        )
+        else:
+            video_upload_date = None
          
          duration_str = self._search_meta(u'duration', video_page)
-        duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
-        video_duration = (
-            (
-                (int(duration_m.group('hours')) * 60 * 60) +
-                (int(duration_m.group('minutes')) * 60) +
-                int(duration_m.group('seconds'))
+        if duration_str:
+            duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+            video_duration = (
+                (
+                    (int(duration_m.group('hours')) * 60 * 60) +
+                    (int(duration_m.group('minutes')) * 60) +
+                    int(duration_m.group('seconds'))
+                )
+                if duration_m else None
              )
-            if duration_m else None
-        )
+        else:
+            video_duration = None
          
          video_uploader = self._html_search_regex(
              u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
@@ -202,7 +219,7 @@ class SmotriIE(InfoExtractor):
              'uploader': video_uploader,
              'upload_date': video_upload_date,
              'uploader_id': video_uploader_id,
-            'video_duration': video_duration,
+            'duration': video_duration,
              'view_count': video_view_count,
              'age_limit': 18 if adult_content else 0,
              'video_page_url': video_page_url
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py

index cbba4094bf14c1096cd5b5d6d657513d230655a5..e22ff9c387ab0e01c1e6fcb1da793af877f37a5c 100644 (file)
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -24,7 +24,7 @@ class SoundcloudIE(InfoExtractor):
       """
  
      _VALID_URL = r'''^(?:https?://)?
-                    (?:(?:(?:www\.)?soundcloud\.com/
+                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
                              (?P<uploader>[\w\d-]+)/
                              (?!sets/)(?P<title>[\w\d-]+)/?
                              (?P<token>[^?]+?)?(?:[?].*)?$)
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py

index 61452e47d760cc76731f8e9bfbc319377cf84a25..cec65261bfffd2a25702634047a99526fa3a7d10 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -3,6 +3,7 @@ import json
  
  from .common import InfoExtractor
  from ..utils import (
+    ExtractorError,
      xpath_with_ns,
  )
  
@@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor):
          smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
              'format=smil&mbr=true'.format(video_id))
          meta = self._download_xml(smil_url, video_id)
+
+        try:
+            error_msg = next(
+                n.attrib['abstract']
+                for n in meta.findall(_x('.//smil:ref'))
+                if n.attrib.get('title') == u'Geographic Restriction')
+        except StopIteration:
+            pass
+        else:
+            raise ExtractorError(error_msg, expected=True)
+
          info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
          info_json = self._download_webpage(info_url, video_id)
          info = json.loads(info_json)
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py

index 4f803bcd3c02c69bc390fa0784e8622007f9db49..5a136a9527613e2fb076c1169b3ef5e90c24eafd 100644 (file)
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -15,7 +15,7 @@ class Vbox7IE(InfoExtractor):
      _TEST = {
          u'url': u'http://vbox7.com/play:249bb972c2',
          u'file': u'249bb972c2.flv',
-        u'md5': u'9c70d6d956f888bdc08c124acc120cfe',
+        u'md5': u'99f65c0c9ef9b682b97313e052734c3f',
          u'info_dict': {
              u"title": u"\u0421\u043c\u044f\u0445! \u0427\u0443\u0434\u043e - \u0447\u0438\u0441\u0442 \u0437\u0430 \u0441\u0435\u043a\u0443\u043d\u0434\u0438 - \u0421\u043a\u0440\u0438\u0442\u0430 \u043a\u0430\u043c\u0435\u0440\u0430"
          }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index 4823992ef40f9987a8ef39dde6200286f3bd40a6..a4b26a26f4132840c57700fad96785dfb390a8db 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -15,7 +15,12 @@ class VevoIE(InfoExtractor):
      Accepts urls from vevo.com or in the format 'vevo:{id}'
      (currently used by MTVIE)
      """
-    _VALID_URL = r'((http://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?)|(vevo:))(?P<id>.*?)(\?|$)'
+    _VALID_URL = r'''(?x)
+        (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?|
+           https?://cache\.vevo\.com/m/html/embed\.html\?video=|
+           https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
+           vevo:)
+        (?P<id>[^&?#]+)'''
      _TESTS = [{
          u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
          u'file': u'GB1101300280.mp4',
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py

index acae81448e38e3b362fcfdd93b4a6dcd9cc5f7d0..65463c73324ca83ab87b45bc33d569c3fe881163 100644 (file)
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -15,6 +15,7 @@ class VideoPremiumIE(InfoExtractor):
          u'params': {
              u'skip_download': True,
          },
+        u'skip': u'Test file has been deleted.',
      }
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index ea440952898a15f24019ba421c9efbe810ff1fec..c3623fcbe6b01493c5ec2115f4fe5f2d32737e59 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -16,11 +16,20 @@ from ..utils import (
      unsmuggle_url,
  )
  
+
  class VimeoIE(InfoExtractor):
      """Information extractor for vimeo.com."""
  
      # _VALID_URL matches Vimeo URLs
-    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:.*?/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
+    _VALID_URL = r'''(?x)
+        (?P<proto>https?://)?
+        (?:(?:www|(?P<player>player))\.)?
+        vimeo(?P<pro>pro)?\.com/
+        (?:.*?/)?
+        (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
+        (?:videos?/)?
+        (?P<id>[0-9]+)
+        /?(?:[?&].*)?(?:[#].*)?$'''
      _NETRC_MACHINE = 'vimeo'
      IE_NAME = u'vimeo'
      _TESTS = [
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py

index e3458d2bd4abaa196190f886afce2e9ac05df191..1a6a7688d435bd275777aeb4ba5425cf56d00267 100644 (file)
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -32,7 +32,7 @@ class XTubeIE(InfoExtractor):
  
          video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title')
          video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False)
-        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None)
+        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', fatal=False)
          video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/')
          path = compat_urllib_parse_urlparse(video_url).path
          extension = os.path.splitext(path)[1][1:]
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 874429b78cc4917ca1cbbec7245c85436dd73783..55c345e8a8d4f2c48ff2620fa56df98cecd5db6b 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -162,23 +162,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                            # Dash audio
                            '141', '172', '140', '171', '139',
                            ]
-    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
-                                      # Apple HTTP Live Streaming
-                                      '96', '95', '94', '93', '92', '132', '151',
-                                      # 3D
-                                      '85', '102', '84', '101', '83', '100', '82',
-                                      # Dash video
-                                      '138', '248', '137', '247', '136', '246', '245',
-                                      '244', '135', '243', '134', '242', '133', '160',
-                                      # Dash audio
-                                      '172', '141', '171', '140', '139',
-                                      ]
-    _video_formats_map = {
-        'flv': ['35', '34', '6', '5'],
-        '3gp': ['36', '17', '13'],
-        'mp4': ['38', '37', '22', '18'],
-        'webm': ['46', '45', '44', '43'],
-    }
      _video_extensions = {
          '13': '3gp',
          '17': '3gp',
@@ -236,54 +219,54 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '248': 'webm',
      }
      _video_dimensions = {
-        '5': '400x240',
-        '6': '???',
-        '13': '???',
-        '17': '176x144',
-        '18': '640x360',
-        '22': '1280x720',
-        '34': '640x360',
-        '35': '854x480',
-        '36': '320x240',
-        '37': '1920x1080',
-        '38': '4096x3072',
-        '43': '640x360',
-        '44': '854x480',
-        '45': '1280x720',
-        '46': '1920x1080',
-        '82': '360p',
-        '83': '480p',
-        '84': '720p',
-        '85': '1080p',
-        '92': '240p',
-        '93': '360p',
-        '94': '480p',
-        '95': '720p',
-        '96': '1080p',
-        '100': '360p',
-        '101': '480p',
-        '102': '720p',
-        '132': '240p',
-        '151': '72p',
-        '133': '240p',
-        '134': '360p',
-        '135': '480p',
-        '136': '720p',
-        '137': '1080p',
-        '138': '>1080p',
-        '139': '48k',
-        '140': '128k',
-        '141': '256k',
-        '160': '192p',
-        '171': '128k',
-        '172': '256k',
-        '242': '240p',
-        '243': '360p',
-        '244': '480p',
-        '245': '480p',
-        '246': '480p',
-        '247': '720p',
-        '248': '1080p',
+        '5': {'width': 400, 'height': 240},
+        '6': {},
+        '13': {},
+        '17': {'width': 176, 'height': 144},
+        '18': {'width': 640, 'height': 360},
+        '22': {'width': 1280, 'height': 720},
+        '34': {'width': 640, 'height': 360},
+        '35': {'width': 854, 'height': 480},
+        '36': {'width': 320, 'height': 240},
+        '37': {'width': 1920, 'height': 1080},
+        '38': {'width': 4096, 'height': 3072},
+        '43': {'width': 640, 'height': 360},
+        '44': {'width': 854, 'height': 480},
+        '45': {'width': 1280, 'height': 720},
+        '46': {'width': 1920, 'height': 1080},
+        '82': {'height': 360, 'display': '360p'},
+        '83': {'height': 480, 'display': '480p'},
+        '84': {'height': 720, 'display': '720p'},
+        '85': {'height': 1080, 'display': '1080p'},
+        '92': {'height': 240, 'display': '240p'},
+        '93': {'height': 360, 'display': '360p'},
+        '94': {'height': 480, 'display': '480p'},
+        '95': {'height': 720, 'display': '720p'},
+        '96': {'height': 1080, 'display': '1080p'},
+        '100': {'height': 360, 'display': '360p'},
+        '101': {'height': 480, 'display': '480p'},
+        '102': {'height': 720, 'display': '720p'},
+        '132': {'height': 240, 'display': '240p'},
+        '151': {'height': 72, 'display': '72p'},
+        '133': {'height': 240, 'display': '240p'},
+        '134': {'height': 360, 'display': '360p'},
+        '135': {'height': 480, 'display': '480p'},
+        '136': {'height': 720, 'display': '720p'},
+        '137': {'height': 1080, 'display': '1080p'},
+        '138': {'height': 1081, 'display': '>1080p'},
+        '139': {'display': '48k'},
+        '140': {'display': '128k'},
+        '141': {'display': '256k'},
+        '160': {'height': 192, 'display': '192p'},
+        '171': {'display': '128k'},
+        '172': {'display': '256k'},
+        '242': {'height': 240, 'display': '240p'},
+        '243': {'height': 360, 'display': '360p'},
+        '244': {'height': 480, 'display': '480p'},
+        '245': {'height': 480, 'display': '480p'},
+        '246': {'height': 480, 'display': '480p'},
+        '247': {'height': 720, 'display': '720p'},
+        '248': {'height': 1080, 'display': '1080p'},
      }
      _special_itags = {
          '82': '3D',
@@ -1153,13 +1136,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              self._downloader.report_warning(err_msg)
              return {}
  
-    def _print_formats(self, formats):
-        print('Available formats:')
-        for x in formats:
-            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
-                                        self._video_dimensions.get(x, '???'),
-                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
-
      def _extract_id(self, url):
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)
          if mobj is None:
@@ -1172,48 +1148,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          Transform a dictionary in the format {itag:url} to a list of (itag, url)
          with the requested formats.
          """
-        req_format = self._downloader.params.get('format', None)
-        format_limit = self._downloader.params.get('format_limit', None)
-        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
-        if format_limit is not None and format_limit in available_formats:
-            format_list = available_formats[available_formats.index(format_limit):]
-        else:
-            format_list = available_formats
-        existing_formats = [x for x in format_list if x in url_map]
+        existing_formats = [x for x in self._available_formats if x in url_map]
          if len(existing_formats) == 0:
              raise ExtractorError(u'no known formats available for video')
-        if self._downloader.params.get('listformats', None):
-            self._print_formats(existing_formats)
-            return
-        if req_format is None or req_format == 'best':
-            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
-        elif req_format == 'worst':
-            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
-        elif req_format in ('-1', 'all'):
-            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
-        else:
-            # Specific formats. We pick the first in a slash-delimeted sequence.
-            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
-            # available in the specified format. For example,
-            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
-            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
-            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
-            req_formats = req_format.split('/')
-            video_url_list = None
-            for rf in req_formats:
-                if rf in url_map:
-                    video_url_list = [(rf, url_map[rf])]
-                    break
-                if rf in self._video_formats_map:
-                    for srf in self._video_formats_map[rf]:
-                        if srf in url_map:
-                            video_url_list = [(srf, url_map[srf])]
-                            break
-                    else:
-                        continue
-                    break
-            if video_url_list is None:
-                raise ExtractorError(u'requested format not available')
+        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+        video_url_list.reverse() # order worst to best
          return video_url_list
  
      def _extract_from_m3u8(self, manifest_url, video_id):
@@ -1361,7 +1300,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  video_description = u''
  
          def _extract_count(klass):
-            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
+            count = self._search_regex(
+                r'class="%s">([\d,]+)</span>' % re.escape(klass),
+                video_webpage, klass, default=None)
              if count is not None:
                  return int(count.replace(',', ''))
              return None
@@ -1377,9 +1318,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
          if 'length_seconds' not in video_info:
              self._downloader.report_warning(u'unable to extract video duration')
-            video_duration = ''
+            video_duration = None
          else:
-            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+            video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
  
          # annotations
          video_annotations = None
@@ -1460,50 +1401,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                          url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
              video_url_list = self._get_video_url_list(url_map)
-            if not video_url_list:
-                return
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
              url_map = self._extract_from_m3u8(manifest_url, video_id)
              video_url_list = self._get_video_url_list(url_map)
-            if not video_url_list:
-                return
-
          else:
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
-        results = []
+        formats = []
          for itag, video_real_url in video_url_list:
              # Extension
              video_extension = self._video_extensions.get(itag, 'flv')
+            resolution = self._video_dimensions.get(itag, {}).get('display')
+            width = self._video_dimensions.get(itag, {}).get('width')
+            height = self._video_dimensions.get(itag, {}).get('height')
+            note = self._special_itags.get(itag)
  
              video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
-                                              self._video_dimensions.get(itag, '???'),
+                                              '%dx%d' % (width, height) if width is not None and height is not None else (resolution if resolution is not None else '???'),
                                                ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
  
-            results.append({
-                'id':       video_id,
-                'url':      video_real_url,
-                'uploader': video_uploader,
-                'uploader_id': video_uploader_id,
-                'upload_date':  upload_date,
-                'title':    video_title,
-                'ext':      video_extension,
-                'format':   video_format,
-                'format_id': itag,
-                'thumbnail':    video_thumbnail,
-                'description':  video_description,
-                'player_url':   player_url,
-                'subtitles':    video_subtitles,
-                'duration':     video_duration,
-                'age_limit':    18 if age_gate else 0,
-                'annotations':  video_annotations,
-                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
-                'view_count': view_count,
-                'like_count': like_count,
-                'dislike_count': dislike_count,
+            formats.append({
+                'url':         video_real_url,
+                'ext':         video_extension,
+                'format':      video_format,
+                'format_id':   itag,
+                'player_url':  player_url,
+                '_resolution': resolution,
+                'width':       width,
+                'height':      height,
+                'format_note': note,
              })
-        return results
+
+        def _formats_key(f):
+            note = f.get('format_note')
+            if note is None:
+                note = u''
+            is_dash = u'DASH' in note
+            return (
+                0 if is_dash else 1,
+                f.get('height') if f.get('height') is not None else -1,
+                f.get('width') if f.get('width') is not None else -1)
+        formats.sort(key=_formats_key)
+
+        return {
+            'id':           video_id,
+            'uploader':     video_uploader,
+            'uploader_id':  video_uploader_id,
+            'upload_date':  upload_date,
+            'title':        video_title,
+            'thumbnail':    video_thumbnail,
+            'description':  video_description,
+            'subtitles':    video_subtitles,
+            'duration':     video_duration,
+            'age_limit':    18 if age_gate else 0,
+            'annotations':  video_annotations,
+            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+            'view_count':   view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'formats':      formats,
+        }
  
  class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
@@ -1715,7 +1673,7 @@ class YoutubeUserIE(InfoExtractor):
          # page by page until there are no video ids - it means we got
          # all of them.
  
-        video_ids = []
+        url_results = []
  
          for pagenum in itertools.count(0):
              start_index = pagenum * self._GDATA_PAGE_SIZE + 1
@@ -1733,10 +1691,17 @@ class YoutubeUserIE(InfoExtractor):
                  break
  
              # Extract video identifiers
-            ids_in_page = []
-            for entry in response['feed']['entry']:
-                ids_in_page.append(entry['id']['$t'].split('/')[-1])
-            video_ids.extend(ids_in_page)
+            entries = response['feed']['entry']
+            for entry in entries:
+                title = entry['title']['$t']
+                video_id = entry['id']['$t'].split('/')[-1]  # last '/'-separated segment of the entry id
+                url_results.append({
+                    '_type': 'url',
+                    'url': video_id,
+                    'ie_key': 'Youtube',
+                    'id': video_id,
+                    'title': title,
+                })
  
              # A little optimization - if current page is not
              # "full", ie. does not contain PAGE_SIZE video ids then
@@ -1744,12 +1709,9 @@ class YoutubeUserIE(InfoExtractor):
              # are no more ids on further pages - no need to query
              # again.
  
-            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
+            if len(entries) < self._GDATA_PAGE_SIZE:
                  break
  
-        url_results = [
-            self.url_result(video_id, 'Youtube', video_id=video_id)
-            for video_id in video_ids]
          return self.playlist_result(url_results, playlist_title=username)
  
  
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 4593488ce5e30bb69a19c56cc4c84a581d3d17f6..2e48f187e665dad81caa663efdb9d0c33f088936 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,7 @@
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  
+import ctypes
  import datetime
  import email.utils
  import errno
@@ -766,6 +767,10 @@ def unified_strdate(date_str):
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
          except:
              pass
+    if upload_date is None:
+        timetuple = email.utils.parsedate_tz(date_str)
+        if timetuple:
+            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
      return upload_date
  
  def determine_ext(url, default_ext=u'unknown_video'):
@@ -1051,7 +1056,7 @@ def month_by_name(name):
      """ Return the number of a month by (locale-independently) English name """
  
      ENGLISH_NAMES = [
-        u'Januar', u'February', u'March', u'April', u'May', u'June',
+        u'January', u'February', u'March', u'April', u'May', u'June',
          u'July', u'August', u'September', u'October', u'November', u'December']
      try:
          return ENGLISH_NAMES.index(name) + 1
@@ -1062,3 +1067,34 @@ def month_by_name(name):
  def fix_xml_all_ampersand(xml_str):
      """Replace all the '&' by '&amp;' in XML"""
      return xml_str.replace(u'&', u'&amp;')
+
+
+def setproctitle(title):
+    """Best-effort: set the process name via libc prctl(); silently does nothing if unavailable."""
+    assert isinstance(title, type(u''))
+    try:
+        libc = ctypes.cdll.LoadLibrary("libc.so.6")
+    except OSError:
+        return  # libc.so.6 could not be loaded, nothing to do
+    # Build the buffer from the encoded bytes: sizing it by len(title) overflows on non-ASCII titles
+    buf = ctypes.create_string_buffer(title.encode('utf-8'))
+    try:
+        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)  # 15 == PR_SET_NAME
+    except AttributeError:
+        return  # Strange libc, just skip this
+
+
+def remove_start(s, start):
+    """Return s with the prefix start removed; s is returned unchanged when it lacks that prefix."""
+    prefix_len = len(start) if s.startswith(start) else 0
+    return s[prefix_len:]
+
+
+def url_basename(url):
+    """Return the final '/'-separated component of the URL's path, ignoring leading/trailing slashes."""
+    return compat_urlparse.urlparse(url).path.strip(u'/').rsplit(u'/', 1)[-1]
+
+
+class HEADRequest(compat_urllib_request.Request):
+    def get_method(self):  # Request subclass whose method is always reported as HEAD
+        return "HEAD"
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index b9a52fcfab43ee9063395c605d69d9f95509339c..67f427894481f955b24c1cb6a32f67a2754f45a6 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.12.11.2'
+__version__ = '2013.12.23.3'
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 23 Dec 2013 04:03:32 +0000 (05:03 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 23 Dec 2013 04:03:32 +0000 (05:03 +0100)
README.md		patch \| blob \| history
test/test_all_urls.py		patch \| blob \| history
test/test_playlists.py		patch \| blob \| history
test/test_utils.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/aes.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/academicearth.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/aparat.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/blinkx.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/bliptv.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/cbs.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/channel9.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/crunchyroll.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/daum.py		patch \| blob \| history
youtube_dl/extractor/facebook.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/ign.py		patch \| blob \| history
youtube_dl/extractor/imdb.py		patch \| blob \| history
youtube_dl/extractor/ivi.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/mdr.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/mtv.py		patch \| blob \| history
youtube_dl/extractor/naver.py		patch \| blob \| history
youtube_dl/extractor/ndtv.py		patch \| blob \| history
youtube_dl/extractor/ooyala.py		patch \| blob \| history
youtube_dl/extractor/pornhd.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/radiofrance.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/rtlnow.py		patch \| blob \| history
youtube_dl/extractor/smotri.py		patch \| blob \| history
youtube_dl/extractor/soundcloud.py		patch \| blob \| history
youtube_dl/extractor/theplatform.py		patch \| blob \| history
youtube_dl/extractor/vbox7.py		patch \| blob \| history
youtube_dl/extractor/vevo.py		patch \| blob \| history
youtube_dl/extractor/videopremium.py		patch \| blob \| history
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/xtube.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history