Merge remote-tracking branch 'rbrito/swap-dimensions'

author Philipp Hagemeister <phihag@phihag.de>

Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)
diff --git a/README.md b/README.md

index 2b8db0cfc2c9f7f78be5f3fb69fdeb345562d1b5..580b1600446d727b20eaa850ca7ec60535626a31 100644 (file)
--- a/README.md
+++ b/README.md
@@ -79,24 +79,27 @@ which means you can modify it, redistribute it or use it however you like.
                                 different, %(autonumber)s to get an automatically
                                 incremented number, %(ext)s for the filename
                                 extension, %(format)s for the format description
-                               (like "22 - 1280x720" or "HD")%(upload_date)s for
-                               the upload date (YYYYMMDD), %(extractor)s for the
-                               provider (youtube, metacafe, etc), %(id)s for the
-                               video id , %(playlist)s for the playlist the
-                               video is in, %(playlist_index)s for the position
-                               in the playlist and %% for a literal percent. Use
-                               - to output to stdout. Can also be used to
-                               download to a different directory, for example
-                               with -o '/my/downloads/%(uploader)s/%(title)s-%(i
-                               d)s.%(ext)s' .
+                               (like "22 - 1280x720" or "HD"), %(format_id)s for
+                               the unique id of the format (like Youtube's
+                               itags: "137"), %(upload_date)s for the upload date
+                               (YYYYMMDD), %(extractor)s for the provider
+                               (youtube, metacafe, etc), %(id)s for the video id,
+                               %(playlist)s for the playlist the video is in,
+                               %(playlist_index)s for the position in the
+                               playlist and %% for a literal percent. Use - to
+                               output to stdout. Can also be used to download to
+                               a different directory, for example with -o '/my/d
+                               ownloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
      --autonumber-size NUMBER   Specifies the number of digits in %(autonumber)s
                                 when it is present in output filename template or
-                               --autonumber option is given
+                               --auto-number option is given
      --restrict-filenames       Restrict filenames to only ASCII characters, and
                                 avoid "&" and spaces in filenames
      -a, --batch-file FILE      file containing URLs to download ('-' for stdin)
      -w, --no-overwrites        do not overwrite files
-    -c, --continue             resume partially downloaded files
+    -c, --continue             force resume of partially downloaded files. By
+                               default, youtube-dl will resume downloads if
+                               possible.
      --no-continue              do not resume partially downloaded files (restart
                                 from beginning)
      --cookies FILE             file to read cookies from and dump cookie jar in
@@ -120,12 +123,15 @@ which means you can modify it, redistribute it or use it however you like.
      --get-description          simulate, quiet but print video description
      --get-filename             simulate, quiet but print output filename
      --get-format               simulate, quiet but print output format
+    -j, --dump-json            simulate, quiet but print JSON information
      --newline                  output progress bar as new lines
      --no-progress              do not print progress bar
      --console-title            display progress in console titlebar
      -v, --verbose              print various debugging information
      --dump-intermediate-pages  print downloaded pages to debug problems(very
                                 verbose)
+    --write-pages              Write downloaded pages to files in the current
+                               directory
  
  ## Video Format Options:
      -f, --format FORMAT        video format code, specifiy the order of
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py

new file mode 100644 (file)

index 0000000..63401fe
--- /dev/null
+++ b/devscripts/check-porn.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+"""
+This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check
+whether we are failing to tag some porn site with 'age_limit'
+"""
+
+# Allow direct execution
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import get_testcases
+from youtube_dl.utils import compat_urllib_request
+
+for test in get_testcases():
+    try:
+        webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read()
+    except:
+        print('\nFail: {0}'.format(test['name']))
+        continue
+
+    webpage = webpage.decode('utf8', 'replace')
+
+    if 'porn' in webpage.lower() and ('info_dict' not in test
+                                      or 'age_limit' not in test['info_dict']
+                                      or test['info_dict']['age_limit'] != 18):
+        print('\nPotential missing age_limit check: {0}'.format(test['name']))
+
+    elif 'porn' not in webpage.lower() and ('info_dict' in test and
+                                            'age_limit' in test['info_dict'] and
+                                            test['info_dict']['age_limit'] == 18):
+        print('\nPotential false negative: {0}'.format(test['name']))
+
+    else:
+        sys.stdout.write('.')
+    sys.stdout.flush()
+
+print()
diff --git a/setup.py b/setup.py

index 3b6dc2d40f0f551630dac1007aaf72e0af819724..8e24fe67918eeefa2f3f8b445ccfb480b8c841a8 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -8,8 +8,10 @@ import sys
  
  try:
      from setuptools import setup
+    setuptools_available = True
  except ImportError:
      from distutils.core import setup
+    setuptools_available = False
  
  try:
      # This will create an exe that needs Microsoft Visual C++ 2008
@@ -43,13 +45,16 @@ if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
      params = py2exe_params
  else:
      params = {
-        'scripts': ['bin/youtube-dl'],
          'data_files': [  # Installing system-wide would require sudo...
              ('etc/bash_completion.d', ['youtube-dl.bash-completion']),
              ('share/doc/youtube_dl', ['README.txt']),
-            ('share/man/man1/', ['youtube-dl.1'])
+            ('share/man/man1', ['youtube-dl.1'])
          ]
      }
+    if setuptools_available:
+        params['entry_points'] = {'console_scripts': ['youtube-dl = youtube_dl:main']}
+    else:
+        params['scripts'] = ['bin/youtube-dl']
  
  # Get the version from youtube_dl/version.py without importing the package
  exec(compile(open('youtube_dl/version.py').read(),
@@ -63,6 +68,7 @@ setup(
      ' YouTube.com and other video sites.',
      url='https://github.com/rg3/youtube-dl',
      author='Ricardo Garcia',
+    author_email='ytdl@yt-dl.org',
      maintainer='Philipp Hagemeister',
      maintainer_email='phihag@phihag.de',
      packages=['youtube_dl', 'youtube_dl.extractor'],
diff --git a/test/helper.py b/test/helper.py

index 777119ea5fa6fe7b43ae5efe53e9c7685be347b8..b1f421ac58331bad23328502f42a0e1316df853d 100644 (file)
--- a/test/helper.py
+++ b/test/helper.py
@@ -5,13 +5,11 @@ import json
  import os.path
  import re
  import types
+import sys
  
  import youtube_dl.extractor
  from youtube_dl import YoutubeDL
-
-
-def global_setup():
-    youtube_dl._setup_opener(timeout=10)
+from youtube_dl.utils import preferredencoding
  
  
  def get_params(override=None):
@@ -33,6 +31,21 @@ def try_rm(filename):
              raise
  
  
+def report_warning(message):
+    '''
+    Print the message to stderr; it will be prefixed with 'WARNING:'.
+    If stderr is a tty (and os.name is not 'nt'), 'WARNING:' will be colored.
+    '''
+    if sys.stderr.isatty() and os.name != 'nt':
+        _msg_header = u'\033[0;33mWARNING:\033[0m'
+    else:
+        _msg_header = u'WARNING:'
+    output = u'%s %s\n' % (_msg_header, message)
+    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
+        output = output.encode(preferredencoding())
+    sys.stderr.write(output)
+
+
  class FakeYDL(YoutubeDL):
      def __init__(self, override=None):
          # Different instances of the downloader can't share the same dictionary
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py

index ba6dc05bc3c5549783d7e93c3226484da2f0602a..58cf9c313607020d1493b420f8b93e18ccccd474 100644 (file)
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -62,10 +62,10 @@ class TestFormatSelection(unittest.TestCase):
  
      def test_format_limit(self):
          formats = [
-            {u'format_id': u'meh'},
-            {u'format_id': u'good'},
-            {u'format_id': u'great'},
-            {u'format_id': u'excellent'},
+            {u'format_id': u'meh', u'url': u'http://example.com/meh'},
+            {u'format_id': u'good', u'url': u'http://example.com/good'},
+            {u'format_id': u'great', u'url': u'http://example.com/great'},
+            {u'format_id': u'excellent', u'url': u'http://example.com/exc'},
          ]
          info_dict = {
              u'formats': formats, u'extractor': u'test', 'id': 'testvid'}
@@ -94,6 +94,52 @@ class TestFormatSelection(unittest.TestCase):
          downloaded = ydl.downloaded_info_dicts[0]
          self.assertEqual(downloaded[u'format_id'], u'excellent')
  
+    def test_format_selection(self):
+        formats = [
+            {u'format_id': u'35', u'ext': u'mp4'},
+            {u'format_id': u'45', u'ext': u'webm'},
+            {u'format_id': u'47', u'ext': u'webm'},
+            {u'format_id': u'2', u'ext': u'flv'},
+        ]
+        info_dict = {u'formats': formats, u'extractor': u'test'}
+
+        ydl = YDL({'format': u'20/47'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], u'47')
+
+        ydl = YDL({'format': u'20/71/worst'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], u'35')
+
+        ydl = YDL()
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], u'2')
+
+        ydl = YDL({'format': u'webm/mp4'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], u'47')
+
+        ydl = YDL({'format': u'3gp/40/mp4'})
+        ydl.process_ie_result(info_dict)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], u'35')
+
+    def test_add_extra_info(self):
+        test_dict = {
+            'extractor': 'Foo',
+        }
+        extra_info = {
+            'extractor': 'Bar',
+            'playlist': 'funny videos',
+        }
+        YDL.add_extra_info(test_dict, extra_info)
+        self.assertEqual(test_dict['extractor'], 'Foo')
+        self.assertEqual(test_dict['playlist'], 'funny videos')
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py

index d500c6edceb6018510b9226d925d9f407b72fcbd..c9cdb96cb30578d58724ddadb4328ad790316a39 100644 (file)
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -6,8 +6,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import global_setup, try_rm
-global_setup()
+from test.helper import try_rm
  
  
  from youtube_dl import YoutubeDL
@@ -24,7 +23,7 @@ def _download_restricted(url, filename, age):
      }
      ydl = YoutubeDL(params)
      ydl.add_default_info_extractors()
-    json_filename = filename + '.info.json'
+    json_filename = os.path.splitext(filename)[0] + '.info.json'
      try_rm(json_filename)
      ydl.download([url])
      res = os.path.exists(json_filename)
diff --git a/test/test_all_urls.py b/test/test_all_urls.py

index 56e5f80e1f6ddb17fef3ee5c499c238996c12051..1f1adb6b46e0fa2e8a683e6593f699476397a0cd 100644 (file)
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -100,10 +100,11 @@ class TestAllURLsMatching(unittest.TestCase):
      def test_keywords(self):
          self.assertMatch(':ytsubs', ['youtube:subscriptions'])
          self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
-        self.assertMatch(':thedailyshow', ['ComedyCentral'])
-        self.assertMatch(':tds', ['ComedyCentral'])
-        self.assertMatch(':colbertreport', ['ComedyCentral'])
-        self.assertMatch(':cr', ['ComedyCentral'])
+        self.assertMatch(':ythistory', ['youtube:history'])
+        self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
+        self.assertMatch(':tds', ['ComedyCentralShows'])
+        self.assertMatch(':colbertreport', ['ComedyCentralShows'])
+        self.assertMatch(':cr', ['ComedyCentralShows'])
  
  
  if __name__ == '__main__':
diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py

deleted file mode 100644 (file)

index c596415..0000000
--- a/test/test_dailymotion_subtitles.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env python
-
-# Allow direct execution
-import os
-import sys
-import unittest
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from test.helper import FakeYDL, global_setup, md5
-global_setup()
-
-
-from youtube_dl.extractor import DailymotionIE
-
-class TestDailymotionSubtitles(unittest.TestCase):
-    def setUp(self):
-        self.DL = FakeYDL()
-        self.url = 'http://www.dailymotion.com/video/xczg00'
-    def getInfoDict(self):
-        IE = DailymotionIE(self.DL)
-        info_dict = IE.extract(self.url)
-        return info_dict
-    def getSubtitles(self):
-        info_dict = self.getInfoDict()
-        return info_dict[0]['subtitles']
-    def test_no_writesubtitles(self):
-        subtitles = self.getSubtitles()
-        self.assertEqual(subtitles, None)
-    def test_subtitles(self):
-        self.DL.params['writesubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f')
-    def test_subtitles_lang(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitleslangs'] = ['fr']
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
-    def test_allsubtitles(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['allsubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(len(subtitles.keys()), 5)
-    def test_list_subtitles(self):
-        self.DL.expect_warning(u'Automatic Captions not supported by this server')
-        self.DL.params['listsubtitles'] = True
-        info_dict = self.getInfoDict()
-        self.assertEqual(info_dict, None)
-    def test_automatic_captions(self):
-        self.DL.expect_warning(u'Automatic Captions not supported by this server')
-        self.DL.params['writeautomaticsub'] = True
-        self.DL.params['subtitleslang'] = ['en']
-        subtitles = self.getSubtitles()
-        self.assertTrue(len(subtitles.keys()) == 0)
-    def test_nosubtitles(self):
-        self.DL.expect_warning(u'video doesn\'t have subtitles')
-        self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['allsubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(len(subtitles), 0)
-    def test_multiple_langs(self):
-        self.DL.params['writesubtitles'] = True
-        langs = ['es', 'fr', 'de']
-        self.DL.params['subtitleslangs'] = langs
-        subtitles = self.getSubtitles()
-        for lang in langs:
-            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/test/test_download.py b/test/test_download.py

index b9a9be11d9686243ed2a1d5b748db4bc04712c54..dd5818dba91c166936e45f1c7d8779c752fa3b86 100644 (file)
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -6,8 +6,13 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, get_testcases, global_setup, try_rm, md5
-global_setup()
+from test.helper import (
+    get_params,
+    get_testcases,
+    try_rm,
+    md5,
+    report_warning
+)
  
  
  import hashlib
@@ -19,10 +24,12 @@ import youtube_dl.YoutubeDL
  from youtube_dl.utils import (
      compat_str,
      compat_urllib_error,
+    compat_HTTPError,
      DownloadError,
      ExtractorError,
      UnavailableVideoError,
  )
+from youtube_dl.extractor import get_info_extractor
  
  RETRIES = 3
  
@@ -55,17 +62,25 @@ def generator(test_case):
  
      def test_template(self):
          ie = youtube_dl.extractor.get_info_extractor(test_case['name'])
+        other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])]
          def print_skipping(reason):
              print('Skipping %s: %s' % (test_case['name'], reason))
-        if not ie._WORKING:
+        if not ie.working():
              print_skipping('IE marked as not _WORKING')
              return
-        if 'playlist' not in test_case and not test_case['file']:
-            print_skipping('No output file specified')
-            return
+        if 'playlist' not in test_case:
+            info_dict = test_case.get('info_dict', {})
+            if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
+                print_skipping('The output file cannot be know, the "file" '
+                    'key is missing or the info_dict is incomplete')
+                return
          if 'skip' in test_case:
              print_skipping(test_case['skip'])
              return
+        for other_ie in other_ies:
+            if not other_ie.working():
+                print_skipping(u'test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
+                return
  
          params = get_params(test_case.get('params', {}))
  
@@ -77,35 +92,48 @@ def generator(test_case):
                  finished_hook_called.add(status['filename'])
          ydl.fd.add_progress_hook(_hook)
  
+        def get_tc_filename(tc):
+            return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {}))
+
          test_cases = test_case.get('playlist', [test_case])
-        for tc in test_cases:
-            try_rm(tc['file'])
-            try_rm(tc['file'] + '.part')
-            try_rm(tc['file'] + '.info.json')
+        def try_rm_tcs_files():
+            for tc in test_cases:
+                tc_filename = get_tc_filename(tc)
+                try_rm(tc_filename)
+                try_rm(tc_filename + '.part')
+                try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
+        try_rm_tcs_files()
          try:
-            for retry in range(1, RETRIES + 1):
+            try_num = 1
+            while True:
                  try:
                      ydl.download([test_case['url']])
                  except (DownloadError, ExtractorError) as err:
-                    if retry == RETRIES: raise
-
                      # Check if the exception is not a network related one
-                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
                          raise
  
-                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry))
+                    if try_num == RETRIES:
+                        report_warning(u'Failed due to network errors, skipping...')
+                        return
+
+                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num))
+
+                    try_num += 1
                  else:
                      break
  
              for tc in test_cases:
+                tc_filename = get_tc_filename(tc)
                  if not test_case.get('params', {}).get('skip_download', False):
-                    self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file'])
-                    self.assertTrue(tc['file'] in finished_hook_called)
-                self.assertTrue(os.path.exists(tc['file'] + '.info.json'))
+                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
+                    self.assertTrue(tc_filename in finished_hook_called)
+                info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
+                self.assertTrue(os.path.exists(info_json_fn))
                  if 'md5' in tc:
-                    md5_for_file = _file_md5(tc['file'])
+                    md5_for_file = _file_md5(tc_filename)
                      self.assertEqual(md5_for_file, tc['md5'])
-                with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
+                with io.open(info_json_fn, encoding='utf-8') as infof:
                      info_dict = json.load(infof)
                  for (info_field, expected) in tc.get('info_dict', {}).items():
                      if isinstance(expected, compat_str) and expected.startswith('md5:'):
@@ -125,11 +153,11 @@ def generator(test_case):
                  # Check for the presence of mandatory fields
                  for key in ('id', 'url', 'title', 'ext'):
                      self.assertTrue(key in info_dict.keys() and info_dict[key])
+                # Check for mandatory fields that are automatically set by YoutubeDL
+                for key in ['webpage_url', 'extractor', 'extractor_key']:
+                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
          finally:
-            for tc in test_cases:
-                try_rm(tc['file'])
-                try_rm(tc['file'] + '.part')
-                try_rm(tc['file'] + '.info.json')
+            try_rm_tcs_files()
  
      return test_template
  
diff --git a/test/test_playlists.py b/test/test_playlists.py

index d6a8d56df99609e50ea5885d2f5a3eb48b72cf37..167801ae246087aae4c7068cb11b84245e560649 100644 (file)
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -8,8 +8,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
  
  
  from youtube_dl.extractor import (
@@ -17,9 +16,12 @@ from youtube_dl.extractor import (
      DailymotionUserIE,
      VimeoChannelIE,
      UstreamChannelIE,
+    SoundcloudSetIE,
      SoundcloudUserIE,
      LivestreamIE,
      NHLVideocenterIE,
+    BambuserChannelIE,
+    BandcampAlbumIE
  )
  
  
@@ -60,6 +62,14 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['id'], u'5124905')
          self.assertTrue(len(result['entries']) >= 11)
  
+    def test_soundcloud_set(self):
+        dl = FakeYDL()
+        ie = SoundcloudSetIE(dl)
+        result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'The Royal Concept EP')
+        self.assertTrue(len(result['entries']) >= 6)
+
      def test_soundcloud_user(self):
          dl = FakeYDL()
          ie = SoundcloudUserIE(dl)
@@ -85,5 +95,21 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['title'], u'Highlights')
          self.assertEqual(len(result['entries']), 12)
  
+    def test_bambuser_channel(self):
+        dl = FakeYDL()
+        ie = BambuserChannelIE(dl)
+        result = ie.extract('http://bambuser.com/channel/pixelversity')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'pixelversity')
+        self.assertTrue(len(result['entries']) >= 60)
+
+    def test_bandcamp_album(self):
+        dl = FakeYDL()
+        ie = BandcampAlbumIE(dl)
+        result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Nightmare Night EP')
+        self.assertTrue(len(result['entries']) >= 4)
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_subtitles.py b/test/test_subtitles.py

new file mode 100644 (file)

index 0000000..94a1f77
--- /dev/null
+++ b/test/test_subtitles.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL, md5
+
+
+from youtube_dl.extractor import (
+    YoutubeIE,
+    DailymotionIE,
+    TEDIE,
+)
+
+
+class BaseTestSubtitles(unittest.TestCase):
+    url = None
+    IE = None
+    def setUp(self):
+        self.DL = FakeYDL()
+        self.ie = self.IE(self.DL)
+
+    def getInfoDict(self):
+        info_dict = self.ie.extract(self.url)
+        return info_dict
+
+    def getSubtitles(self):
+        info_dict = self.getInfoDict()
+        return info_dict['subtitles']
+
+
+class TestYoutubeSubtitles(BaseTestSubtitles):
+    url = 'QRS8MkLhQmM'
+    IE = YoutubeIE
+
+    def getSubtitles(self):
+        info_dict = self.getInfoDict()
+        return info_dict[0]['subtitles']
+
+    def test_youtube_no_writesubtitles(self):
+        self.DL.params['writesubtitles'] = False
+        subtitles = self.getSubtitles()
+        self.assertEqual(subtitles, None)
+
+    def test_youtube_subtitles(self):
+        self.DL.params['writesubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
+
+    def test_youtube_subtitles_lang(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitleslangs'] = ['it']
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
+
+    def test_youtube_allsubtitles(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(len(subtitles.keys()), 13)
+
+    def test_youtube_subtitles_sbv_format(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitlesformat'] = 'sbv'
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
+
+    def test_youtube_subtitles_vtt_format(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitlesformat'] = 'vtt'
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
+
+    def test_youtube_list_subtitles(self):
+        self.DL.expect_warning(u'Video doesn\'t have automatic captions')
+        self.DL.params['listsubtitles'] = True
+        info_dict = self.getInfoDict()
+        self.assertEqual(info_dict, None)
+
+    def test_youtube_automatic_captions(self):
+        self.url = '8YoUxe5ncPo'
+        self.DL.params['writeautomaticsub'] = True
+        self.DL.params['subtitleslangs'] = ['it']
+        subtitles = self.getSubtitles()
+        self.assertTrue(subtitles['it'] is not None)
+
+    def test_youtube_nosubtitles(self):
+        self.DL.expect_warning(u'video doesn\'t have subtitles')
+        self.url = 'sAjKT8FhjI8'
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(len(subtitles), 0)
+
+    def test_youtube_multiple_langs(self):
+        self.url = 'QRS8MkLhQmM'
+        self.DL.params['writesubtitles'] = True
+        langs = ['it', 'fr', 'de']
+        self.DL.params['subtitleslangs'] = langs
+        subtitles = self.getSubtitles()
+        for lang in langs:
+            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+
+
+class TestDailymotionSubtitles(BaseTestSubtitles):
+    url = 'http://www.dailymotion.com/video/xczg00'
+    IE = DailymotionIE
+
+    def test_no_writesubtitles(self):
+        subtitles = self.getSubtitles()
+        self.assertEqual(subtitles, None)
+
+    def test_subtitles(self):
+        self.DL.params['writesubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f')
+
+    def test_subtitles_lang(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitleslangs'] = ['fr']
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
+
+    def test_allsubtitles(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(len(subtitles.keys()), 5)
+
+    def test_list_subtitles(self):
+        self.DL.expect_warning(u'Automatic Captions not supported by this server')
+        self.DL.params['listsubtitles'] = True
+        info_dict = self.getInfoDict()
+        self.assertEqual(info_dict, None)
+
+    def test_automatic_captions(self):
+        self.DL.expect_warning(u'Automatic Captions not supported by this server')
+        self.DL.params['writeautomaticsub'] = True
+        self.DL.params['subtitleslang'] = ['en']
+        subtitles = self.getSubtitles()
+        self.assertTrue(len(subtitles.keys()) == 0)
+
+    def test_nosubtitles(self):
+        self.DL.expect_warning(u'video doesn\'t have subtitles')
+        self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(len(subtitles), 0)
+
+    def test_multiple_langs(self):
+        self.DL.params['writesubtitles'] = True
+        langs = ['es', 'fr', 'de']
+        self.DL.params['subtitleslangs'] = langs
+        subtitles = self.getSubtitles()
+        for lang in langs:
+            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+
+
+class TestTedSubtitles(BaseTestSubtitles):
+    url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html'
+    IE = TEDIE
+
+    def test_no_writesubtitles(self):
+        subtitles = self.getSubtitles()
+        self.assertEqual(subtitles, None)
+
+    def test_subtitles(self):
+        self.DL.params['writesubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['en']), '2154f31ff9b9f89a0aa671537559c21d')
+
+    def test_subtitles_lang(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitleslangs'] = ['fr']
+        subtitles = self.getSubtitles()
+        self.assertEqual(md5(subtitles['fr']), '7616cbc6df20ec2c1204083c83871cf6')
+
+    def test_allsubtitles(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['allsubtitles'] = True
+        subtitles = self.getSubtitles()
+        self.assertEqual(len(subtitles.keys()), 28)
+
+    def test_list_subtitles(self):
+        self.DL.expect_warning(u'Automatic Captions not supported by this server')
+        self.DL.params['listsubtitles'] = True
+        info_dict = self.getInfoDict()
+        self.assertEqual(info_dict, None)
+
+    def test_automatic_captions(self):
+        self.DL.expect_warning(u'Automatic Captions not supported by this server')
+        self.DL.params['writeautomaticsub'] = True
+        self.DL.params['subtitleslang'] = ['en']
+        subtitles = self.getSubtitles()
+        self.assertTrue(len(subtitles.keys()) == 0)
+
+    def test_multiple_langs(self):
+        self.DL.params['writesubtitles'] = True
+        langs = ['es', 'fr', 'de']
+        self.DL.params['subtitleslangs'] = langs
+        subtitles = self.getSubtitles()
+        for lang in langs:
+            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py

index f3fbff042ccc8193d8d08527fdc04421c9832305..e9e590e749f131a0950c79bcf4fee1e9fb9004c2 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -24,6 +24,8 @@ from youtube_dl.utils import (
      xpath_with_ns,
      smuggle_url,
      unsmuggle_url,
+    shell_quote,
+    encodeFilename,
  )
  
  if sys.version_info < (3, 0):
@@ -170,6 +172,10 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(res_url, url)
          self.assertEqual(res_data, None)
  
+    def test_shell_quote(self):
+        args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')]
+        self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py

index 35defb8953402a74ff71b7a9a14cec105a5f1703..eac53b285ab6740b368f278784aced9625abb9a6 100644 (file)
--- a/test/test_write_annotations.py
+++ b/test/test_write_annotations.py
@@ -7,8 +7,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, global_setup, try_rm
-global_setup()
+from test.helper import get_params, try_rm
  
  
  import io
diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py

index a5b6f6972df48f6b7cdcfebc3ea32d11c6a27afa..d7177611b5e1a90aa3bdf612ae873336ff44d686 100644 (file)
--- a/test/test_write_info_json.py
+++ b/test/test_write_info_json.py
@@ -7,8 +7,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, global_setup
-global_setup()
+from test.helper import get_params
  
  
  import io
@@ -31,7 +30,7 @@ params = get_params({
  
  
  TEST_ID = 'BaW_jenozKc'
-INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
+INFO_JSON_FILE = TEST_ID + '.info.json'
  DESCRIPTION_FILE = TEST_ID + '.mp4.description'
  EXPECTED_DESCRIPTION = u'''test chars:  "'/\ä↭𝕐
  
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py

index 4b7a7847bd3a33a9a2bff3e99f9f4cff0de7eebf..8fd073f3144b0c3f39cd1d3d9dbd518a540773c3 100644 (file)
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -6,8 +6,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
  
  
  from youtube_dl.extractor import (
@@ -27,7 +26,7 @@ class TestYoutubeLists(unittest.TestCase):
      def test_youtube_playlist(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
          self.assertIsPlaylist(result)
          self.assertEqual(result['title'], 'ytdl test PL')
          ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
@@ -44,13 +43,13 @@ class TestYoutubeLists(unittest.TestCase):
      def test_issue_673(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('PLBB231211A4F62143')[0]
+        result = ie.extract('PLBB231211A4F62143')
          self.assertTrue(len(result['entries']) > 25)
  
      def test_youtube_playlist_long(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
          self.assertIsPlaylist(result)
          self.assertTrue(len(result['entries']) >= 799)
  
@@ -58,7 +57,7 @@ class TestYoutubeLists(unittest.TestCase):
          #651
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
          ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
          self.assertFalse('pElCt5oNDuI' in ytie_results)
          self.assertFalse('KdPEApIVdWM' in ytie_results)
@@ -66,7 +65,7 @@ class TestYoutubeLists(unittest.TestCase):
      def test_youtube_playlist_empty(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')
          self.assertIsPlaylist(result)
          self.assertEqual(len(result['entries']), 0)
  
@@ -74,7 +73,7 @@ class TestYoutubeLists(unittest.TestCase):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
          # TODO find a > 100 (paginating?) videos course
-        result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0]
+        result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
          entries = result['entries']
          self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
          self.assertEqual(len(entries), 25)
@@ -84,22 +83,22 @@ class TestYoutubeLists(unittest.TestCase):
          dl = FakeYDL()
          ie = YoutubeChannelIE(dl)
          #test paginated channel
-        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0]
+        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')
          self.assertTrue(len(result['entries']) > 90)
          #test autogenerated channel
-        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0]
+        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
          self.assertTrue(len(result['entries']) >= 18)
  
      def test_youtube_user(self):
          dl = FakeYDL()
          ie = YoutubeUserIE(dl)
-        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
+        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')
          self.assertTrue(len(result['entries']) >= 320)
  
      def test_youtube_safe_search(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
+        result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')
          self.assertEqual(len(result['entries']), 2)
  
      def test_youtube_show(self):
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py

index 5e1ff5eb0ede5bcb020cd027ca00d5b4159f9812..056700614b43fa0a3dbceeb82ef991e34fdb53f9 100644 (file)
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -6,9 +6,6 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import global_setup
-global_setup()
-
  
  import io
  import re
diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py

deleted file mode 100644 (file)

index 00430a3..0000000
--- a/test/test_youtube_subtitles.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-
-# Allow direct execution
-import os
-import sys
-import unittest
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from test.helper import FakeYDL, global_setup, md5
-global_setup()
-
-
-from youtube_dl.extractor import YoutubeIE
-
-
-class TestYoutubeSubtitles(unittest.TestCase):
-    def setUp(self):
-        self.DL = FakeYDL()
-        self.url = 'QRS8MkLhQmM'
-
-    def getInfoDict(self):
-        IE = YoutubeIE(self.DL)
-        info_dict = IE.extract(self.url)
-        return info_dict
-
-    def getSubtitles(self):
-        info_dict = self.getInfoDict()
-        return info_dict[0]['subtitles']
-
-    def test_youtube_no_writesubtitles(self):
-        self.DL.params['writesubtitles'] = False
-        subtitles = self.getSubtitles()
-        self.assertEqual(subtitles, None)
-
-    def test_youtube_subtitles(self):
-        self.DL.params['writesubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
-
-    def test_youtube_subtitles_lang(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitleslangs'] = ['it']
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
-
-    def test_youtube_allsubtitles(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['allsubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(len(subtitles.keys()), 13)
-
-    def test_youtube_subtitles_sbv_format(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitlesformat'] = 'sbv'
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
-
-    def test_youtube_subtitles_vtt_format(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitlesformat'] = 'vtt'
-        subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
-
-    def test_youtube_list_subtitles(self):
-        self.DL.expect_warning(u'Video doesn\'t have automatic captions')
-        self.DL.params['listsubtitles'] = True
-        info_dict = self.getInfoDict()
-        self.assertEqual(info_dict, None)
-
-    def test_youtube_automatic_captions(self):
-        self.url = '8YoUxe5ncPo'
-        self.DL.params['writeautomaticsub'] = True
-        self.DL.params['subtitleslangs'] = ['it']
-        subtitles = self.getSubtitles()
-        self.assertTrue(subtitles['it'] is not None)
-
-    def test_youtube_nosubtitles(self):
-        self.DL.expect_warning(u'video doesn\'t have subtitles')
-        self.url = 'sAjKT8FhjI8'
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['allsubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(len(subtitles), 0)
-
-    def test_youtube_multiple_langs(self):
-        self.url = 'QRS8MkLhQmM'
-        self.DL.params['writesubtitles'] = True
-        langs = ['it', 'fr', 'de']
-        self.DL.params['subtitleslangs'] = langs
-        subtitles = self.getSubtitles()
-        for lang in langs:
-            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py

index 8ecabab1a517467c118dad0857c47291bcb2f929..27684d0f6a6302ce1f8b23c036190ff6ada0b760 100644 (file)
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -1,15 +1,19 @@
-import math
  import os
  import re
  import subprocess
  import sys
  import time
-import traceback
  
-if os.name == 'nt':
-    import ctypes
-
-from .utils import *
+from .utils import (
+    compat_urllib_error,
+    compat_urllib_request,
+    ContentTooShortError,
+    determine_ext,
+    encodeFilename,
+    format_bytes,
+    sanitize_open,
+    timeconvert,
+)
  
  
  class FileDownloader(object):
@@ -49,20 +53,6 @@ class FileDownloader(object):
          self._progress_hooks = []
          self.params = params
  
-    @staticmethod
-    def format_bytes(bytes):
-        if bytes is None:
-            return 'N/A'
-        if type(bytes) is str:
-            bytes = float(bytes)
-        if bytes == 0.0:
-            exponent = 0
-        else:
-            exponent = int(math.log(bytes, 1024.0))
-        suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
-        converted = float(bytes) / float(1024 ** exponent)
-        return '%.2f%s' % (converted, suffix)
-
      @staticmethod
      def format_seconds(seconds):
          (mins, secs) = divmod(seconds, 60)
@@ -113,7 +103,7 @@ class FileDownloader(object):
      def format_speed(speed):
          if speed is None:
              return '%10s' % '---b/s'
-        return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
+        return '%10s' % ('%s/s' % format_bytes(speed))
  
      @staticmethod
      def best_block_size(elapsed_time, bytes):
@@ -144,16 +134,8 @@ class FileDownloader(object):
      def to_stderr(self, message):
          self.ydl.to_screen(message)
  
-    def to_cons_title(self, message):
-        """Set console/terminal window title to message."""
-        if not self.params.get('consoletitle', False):
-            return
-        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
-            # c_wchar_p() might not be necessary if `message` is
-            # already of type unicode()
-            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
-        elif 'TERM' in os.environ:
-            self.to_screen('\033]0;%s\007' % message, skip_eol=True)
+    def to_console_title(self, message):
+        self.ydl.to_console_title(message)
  
      def trouble(self, *args, **kargs):
          self.ydl.trouble(*args, **kargs)
@@ -194,7 +176,7 @@ class FileDownloader(object):
              if old_filename == new_filename:
                  return
              os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              self.report_error(u'unable to rename file')
  
      def try_utime(self, filename, last_modified_hdr):
@@ -227,8 +209,14 @@ class FileDownloader(object):
          if self.params.get('noprogress', False):
              return
          clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
-        eta_str = self.format_eta(eta)
-        percent_str = self.format_percent(percent)
+        if eta is not None:
+            eta_str = self.format_eta(eta)
+        else:
+            eta_str = 'Unknown ETA'
+        if percent is not None:
+            percent_str = self.format_percent(percent)
+        else:
+            percent_str = 'Unknown %'
          speed_str = self.format_speed(speed)
          if self.params.get('progress_with_newline', False):
              self.to_screen(u'[download] %s of %s at %s ETA %s' %
@@ -236,7 +224,7 @@ class FileDownloader(object):
          else:
              self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
                  (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
-        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
+        self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %
                  (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
  
      def report_resuming_byte(self, resume_len):
@@ -251,7 +239,7 @@ class FileDownloader(object):
          """Report file has already been fully downloaded."""
          try:
              self.to_screen(u'[download] %s has already been downloaded' % file_name)
-        except (UnicodeEncodeError) as err:
+        except UnicodeEncodeError:
              self.to_screen(u'[download] The file has already been downloaded')
  
      def report_unable_to_resume(self):
@@ -267,7 +255,62 @@ class FileDownloader(object):
              self.to_screen(u'\r%s[download] 100%% of %s in %s' %
                  (clear_line, data_len_str, self.format_seconds(tot_time)))
  
-    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url):
+    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
+        def run_rtmpdump(args):
+            start = time.time()
+            resume_percent = None
+            resume_downloaded_data_len = None
+            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+            cursor_in_new_line = True
+            proc_stderr_closed = False
+            while not proc_stderr_closed:
+                # read line from stderr
+                line = u''
+                while True:
+                    char = proc.stderr.read(1)
+                    if not char:
+                        proc_stderr_closed = True
+                        break
+                    if char in [b'\r', b'\n']:
+                        break
+                    line += char.decode('ascii', 'replace')
+                if not line:
+                    # proc_stderr_closed is True
+                    continue
+                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
+                if mobj:
+                    downloaded_data_len = int(float(mobj.group(1))*1024)
+                    percent = float(mobj.group(2))
+                    if not resume_percent:
+                        resume_percent = percent
+                        resume_downloaded_data_len = downloaded_data_len
+                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
+                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
+                    data_len = None
+                    if percent > 0:
+                        data_len = int(downloaded_data_len * 100 / percent)
+                    data_len_str = u'~' + format_bytes(data_len)
+                    self.report_progress(percent, data_len_str, speed, eta)
+                    cursor_in_new_line = False
+                    self._hook_progress({
+                        'downloaded_bytes': downloaded_data_len,
+                        'total_bytes': data_len,
+                        'tmpfilename': tmpfilename,
+                        'filename': filename,
+                        'status': 'downloading',
+                        'eta': eta,
+                        'speed': speed,
+                    })
+                elif self.params.get('verbose', False):
+                    if not cursor_in_new_line:
+                        self.to_screen(u'')
+                    cursor_in_new_line = True
+                    self.to_screen(u'[rtmpdump] '+line)
+            proc.wait()
+            if not cursor_in_new_line:
+                self.to_screen(u'')
+            return proc.returncode
+
          self.report_destination(filename)
          tmpfilename = self.temp_name(filename)
          test = self.params.get('test', False)
@@ -278,12 +321,11 @@ class FileDownloader(object):
          except (OSError, IOError):
              self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
              return False
-        verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet'
  
          # Download using rtmpdump. rtmpdump returns exit code 2 when
          # the connection was interrumpted and resuming appears to be
          # possible. This is part of rtmpdump's normal usage, AFAIK.
-        basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename]
+        basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
          if player_url is not None:
              basic_args += ['--swfVfy', player_url]
          if page_url is not None:
@@ -294,6 +336,8 @@ class FileDownloader(object):
              basic_args += ['--tcUrl', url]
          if test:
              basic_args += ['--stop', '1']
+        if live:
+            basic_args += ['--live']
          args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
          if self.params.get('verbose', False):
              try:
@@ -302,23 +346,25 @@ class FileDownloader(object):
              except ImportError:
                  shell_quote = repr
              self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
-        retval = subprocess.call(args)
+
+        retval = run_rtmpdump(args)
+
          while (retval == 2 or retval == 1) and not test:
              prevsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
+            self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
              time.sleep(5.0) # This seems to be needed
-            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
              cursize = os.path.getsize(encodeFilename(tmpfilename))
              if prevsize == cursize and retval == 1:
                  break
               # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
              if prevsize == cursize and retval == 2 and cursize > 1024:
-                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                  retval = 0
                  break
          if retval == 0 or (test and retval == 2):
              fsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
+            self.to_screen(u'[rtmpdump] %s bytes' % fsize)
              self.try_rename(tmpfilename, filename)
              self._hook_progress({
                  'downloaded_bytes': fsize,
@@ -366,15 +412,20 @@ class FileDownloader(object):
          self.report_destination(filename)
          tmpfilename = self.temp_name(filename)
  
-        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename]
-        # Check for ffmpeg first
-        try:
-            subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
-        except (OSError, IOError):
-            self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] )
-            return False
+        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
+            '-bsf:a', 'aac_adtstoasc', tmpfilename]
  
-        retval = subprocess.call(args)
+        for program in ['avconv', 'ffmpeg']:
+            try:
+                subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+                break
+            except (OSError, IOError):
+                pass
+        else:
+            self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found')
+        cmd = [program] + args
+
+        retval = subprocess.call(cmd)
          if retval == 0:
              fsize = os.path.getsize(encodeFilename(tmpfilename))
              self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
@@ -411,7 +462,8 @@ class FileDownloader(object):
                                                  info_dict.get('player_url', None),
                                                  info_dict.get('page_url', None),
                                                  info_dict.get('play_path', None),
-                                                info_dict.get('tc_url', None))
+                                                info_dict.get('tc_url', None),
+                                                info_dict.get('rtmp_live', False))
  
          # Attempt to download using mplayer
          if url.startswith('mms') or url.startswith('rtsp'):
@@ -515,7 +567,7 @@ class FileDownloader(object):
                  self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                  return False
  
-        data_len_str = self.format_bytes(data_len)
+        data_len_str = format_bytes(data_len)
          byte_counter = 0 + resume_len
          block_size = self.params.get('buffersize', 1024)
          start = time.time()
@@ -550,12 +602,11 @@ class FileDownloader(object):
              # Progress message
              speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
              if data_len is None:
-                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
-                eta = None
+                eta = percent = None
              else:
                  percent = self.calc_percent(byte_counter, data_len)
                  eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
-                self.report_progress(percent, data_len_str, speed, eta)
+            self.report_progress(percent, data_len_str, speed, eta)
  
              self._hook_progress({
                  'downloaded_bytes': byte_counter,
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py

index 13b56ede5fdb3d66064a8072cdda87787eee1bae..69aedf87a44c72060e2af135cd95f6f820e9ab0c 100644 (file)
--- a/youtube_dl/PostProcessor.py
+++ b/youtube_dl/PostProcessor.py
@@ -501,7 +501,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
  
          options = ['-c', 'copy']
          for (name, value) in metadata.items():
-            options.extend(['-metadata', '%s="%s"' % (name, value)])
+            options.extend(['-metadata', '%s=%s' % (name, value)])
          options.extend(['-f', ext])
  
          self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 296c0f9924940d145c95226ba0dd27a61f700253..30ba94666a642c45bfc03af75eb09019c04ae9ad 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -5,17 +5,53 @@ from __future__ import absolute_import
  
  import errno
  import io
+import json
  import os
+import platform
  import re
  import shutil
+import subprocess
  import socket
  import sys
  import time
  import traceback
  
-from .utils import *
+if os.name == 'nt':
+    import ctypes
+
+from .utils import (
+    compat_cookiejar,
+    compat_http_client,
+    compat_print,
+    compat_str,
+    compat_urllib_error,
+    compat_urllib_request,
+    ContentTooShortError,
+    date_from_str,
+    DateRange,
+    determine_ext,
+    DownloadError,
+    encodeFilename,
+    ExtractorError,
+    format_bytes,
+    locked_file,
+    make_HTTPS_handler,
+    MaxDownloadsReached,
+    PostProcessingError,
+    platform_name,
+    preferredencoding,
+    SameFileError,
+    sanitize_filename,
+    subtitles_filename,
+    takewhile_inclusive,
+    UnavailableVideoError,
+    write_json_file,
+    write_string,
+    YoutubeDLHandler,
+)
  from .extractor import get_info_extractor, gen_extractors
  from .FileDownloader import FileDownloader
+from .version import __version__
  
  
  class YoutubeDL(object):
@@ -57,6 +93,7 @@ class YoutubeDL(object):
      forcethumbnail:    Force printing thumbnail URL.
      forcedescription:  Force printing description.
      forcefilename:     Force printing final filename.
+    forcejson:         Force printing info_dict as JSON.
      simulate:          Do not download the video files.
      format:            Video format code.
      format_limit:      Highest quality format to try.
@@ -68,6 +105,7 @@ class YoutubeDL(object):
      playlistend:       Playlist item to end at.
      matchtitle:        Download only matching titles.
      rejecttitle:       Reject downloads for matching titles.
+    logger:            Log messages to a logging.Logger instance.
      logtostderr:       Log messages to stderr instead of stdout.
      writedescription:  Write the video description to a .description file
      writeinfojson:     Write the video description to a .info.json file
@@ -91,7 +129,10 @@ class YoutubeDL(object):
      downloadarchive:   File name of a file where all downloads are recorded.
                         Videos already present in the file are not downloaded
                         again.
-    
+    cookiefile:        File name where cookies should be read from and dumped to.
+    nocheckcertificate:Do not verify SSL certificates
+    proxy:             URL of the proxy server to use
+
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
      nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
@@ -131,6 +172,8 @@ class YoutubeDL(object):
          if '%(stitle)s' in self.params['outtmpl']:
              self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
  
+        self._setup_opener()
+
      def add_info_extractor(self, ie):
          """Add an InfoExtractor object to the end of the list."""
          self._ies.append(ie)
@@ -163,7 +206,9 @@ class YoutubeDL(object):
  
      def to_screen(self, message, skip_eol=False):
          """Print message to stdout if not in quiet mode."""
-        if not self.params.get('quiet', False):
+        if self.params.get('logger'):
+            self.params['logger'].debug(message)
+        elif not self.params.get('quiet', False):
              terminator = [u'\n', u''][skip_eol]
              output = message + terminator
              write_string(output, self._screen_file)
@@ -171,10 +216,47 @@ class YoutubeDL(object):
      def to_stderr(self, message):
          """Print message to stderr."""
          assert type(message) == type(u'')
-        output = message + u'\n'
-        if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
-            output = output.encode(preferredencoding())
-        sys.stderr.write(output)
+        if self.params.get('logger'):
+            self.params['logger'].error(message)
+        else:
+            output = message + u'\n'
+            if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+                output = output.encode(preferredencoding())
+            sys.stderr.write(output)
+
+    def to_console_title(self, message):
+        if not self.params.get('consoletitle', False):
+            return
+        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+            # c_wchar_p() might not be necessary if `message` is
+            # already of type unicode()
+            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        elif 'TERM' in os.environ:
+            write_string(u'\033]0;%s\007' % message, self._screen_file)
+
+    def save_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if 'TERM' in os.environ:
+            # Save the title on stack
+            write_string(u'\033[22;0t', self._screen_file)
+
+    def restore_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if 'TERM' in os.environ:
+            # Restore the title from stack
+            write_string(u'\033[23;0t', self._screen_file)
+
+    def __enter__(self):
+        self.save_console_title()
+        return self
+
+    def __exit__(self, *args):
+        self.restore_console_title()
+    
+        if self.params.get('cookiefile') is not None:
+            self.cookiejar.save()
  
      def fixed_template(self):
          """Checks if the output template is fixed."""
@@ -216,10 +298,10 @@ class YoutubeDL(object):
          If stderr is a tty file the 'WARNING:' will be colored
          '''
          if sys.stderr.isatty() and os.name != 'nt':
-            _msg_header=u'\033[0;33mWARNING:\033[0m'
+            _msg_header = u'\033[0;33mWARNING:\033[0m'
          else:
-            _msg_header=u'WARNING:'
-        warning_message=u'%s %s' % (_msg_header,message)
+            _msg_header = u'WARNING:'
+        warning_message = u'%s %s' % (_msg_header, message)
          self.to_stderr(warning_message)
  
      def report_error(self, message, tb=None):
@@ -234,19 +316,6 @@ class YoutubeDL(object):
          error_message = u'%s %s' % (_msg_header, message)
          self.trouble(error_message, tb)
  
-    def slow_down(self, start_time, byte_counter):
-        """Sleep if the download speed is over the rate limit."""
-        rate_limit = self.params.get('ratelimit', None)
-        if rate_limit is None or byte_counter == 0:
-            return
-        now = time.time()
-        elapsed = now - start_time
-        if elapsed <= 0.0:
-            return
-        speed = float(byte_counter) / elapsed
-        if speed > rate_limit:
-            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
-
      def report_writedescription(self, descfn):
          """ Report that the description file is being written """
          self.to_screen(u'[info] Writing video description to: ' + descfn)
@@ -267,7 +336,7 @@ class YoutubeDL(object):
          """Report file has already been fully downloaded."""
          try:
              self.to_screen(u'[download] %s has already been downloaded' % file_name)
-        except (UnicodeEncodeError) as err:
+        except UnicodeEncodeError:
              self.to_screen(u'[download] The file has already been downloaded')
  
      def increment_downloads(self):
@@ -285,16 +354,18 @@ class YoutubeDL(object):
                  autonumber_size = 5
              autonumber_templ = u'%0' + str(autonumber_size) + u'd'
              template_dict['autonumber'] = autonumber_templ % self._num_downloads
-            if template_dict['playlist_index'] is not None:
+            if template_dict.get('playlist_index') is not None:
                  template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
  
-            sanitize = lambda k,v: sanitize_filename(
+            sanitize = lambda k, v: sanitize_filename(
                  u'NA' if v is None else compat_str(v),
                  restricted=self.params.get('restrictfilenames'),
-                is_id=(k==u'id'))
-            template_dict = dict((k, sanitize(k, v)) for k,v in template_dict.items())
+                is_id=(k == u'id'))
+            template_dict = dict((k, sanitize(k, v))
+                                 for k, v in template_dict.items())
  
-            filename = self.params['outtmpl'] % template_dict
+            tmpl = os.path.expanduser(self.params['outtmpl'])
+            filename = tmpl % template_dict
              return filename
          except KeyError as err:
              self.report_error(u'Erroneous output template')
@@ -306,15 +377,17 @@ class YoutubeDL(object):
      def _match_entry(self, info_dict):
          """ Returns None iff the file should be downloaded """
  
-        title = info_dict['title']
-        matchtitle = self.params.get('matchtitle', False)
-        if matchtitle:
-            if not re.search(matchtitle, title, re.IGNORECASE):
-                return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
-        rejecttitle = self.params.get('rejecttitle', False)
-        if rejecttitle:
-            if re.search(rejecttitle, title, re.IGNORECASE):
-                return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+        if 'title' in info_dict:
+            # This can happen when we're just evaluating the playlist
+            title = info_dict['title']
+            matchtitle = self.params.get('matchtitle', False)
+            if matchtitle:
+                if not re.search(matchtitle, title, re.IGNORECASE):
+                    return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+            rejecttitle = self.params.get('rejecttitle', False)
+            if rejecttitle:
+                if re.search(rejecttitle, title, re.IGNORECASE):
+                    return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
          date = info_dict.get('upload_date', None)
          if date is not None:
              dateRange = self.params.get('daterange', DateRange())
@@ -325,17 +398,23 @@ class YoutubeDL(object):
              if age_limit < info_dict.get('age_limit', 0):
                  return u'Skipping "' + title + '" because it is age restricted'
          if self.in_download_archive(info_dict):
-            return (u'%(title)s has already been recorded in archive'
-                    % info_dict)
+            return (u'%s has already been recorded in archive'
+                    % info_dict.get('title', info_dict.get('id', u'video')))
          return None
-        
+
+    @staticmethod
+    def add_extra_info(info_dict, extra_info):
+        '''Set the keys from extra_info in info dict if they are missing'''
+        for key, value in extra_info.items():
+            info_dict.setdefault(key, value)
+
      def extract_info(self, url, download=True, ie_key=None, extra_info={}):
          '''
          Returns a list with a dictionary for each video we find.
          If 'download', also downloads the videos.
          extra_info is a dict containing the extra values to add to each result
           '''
-        
+
          if ie_key:
              ies = [self.get_info_extractor(ie_key)]
          else:
@@ -355,17 +434,17 @@ class YoutubeDL(object):
                      break
                  if isinstance(ie_result, list):
                      # Backwards compatibility: old IE result format
-                    for result in ie_result:
-                        result.update(extra_info)
                      ie_result = {
                          '_type': 'compat_list',
                          'entries': ie_result,
                      }
-                else:
-                    ie_result.update(extra_info)
-                if 'extractor' not in ie_result:
-                    ie_result['extractor'] = ie.IE_NAME
-                return self.process_ie_result(ie_result, download=download)
+                self.add_extra_info(ie_result,
+                    {
+                        'extractor': ie.IE_NAME,
+                        'webpage_url': url,
+                        'extractor_key': ie.ie_key(),
+                    })
+                return self.process_ie_result(ie_result, download, extra_info)
              except ExtractorError as de: # An error we somewhat expected
                  self.report_error(compat_str(de), de.format_traceback())
                  break
@@ -377,7 +456,7 @@ class YoutubeDL(object):
                      raise
          else:
              self.report_error(u'no suitable InfoExtractor: %s' % url)
-        
+
      def process_ie_result(self, ie_result, download=True, extra_info={}):
          """
          Take the result of the ie(may be modified) and resolve all unresolved
@@ -389,8 +468,8 @@ class YoutubeDL(object):
  
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
-            ie_result.update(extra_info)
-            return self.process_video_result(ie_result)
+            self.add_extra_info(ie_result, extra_info)
+            return self.process_video_result(ie_result, download=download)
          elif result_type == 'url':
              # We have to add extra_info to the results because it may be
              # contained in a playlist
@@ -399,9 +478,10 @@ class YoutubeDL(object):
                                       ie_key=ie_result.get('ie_key'),
                                       extra_info=extra_info)
          elif result_type == 'playlist':
+
              # We process each entry in the playlist
              playlist = ie_result.get('title', None) or ie_result.get('id', None)
-            self.to_screen(u'[download] Downloading playlist: %s'  % playlist)
+            self.to_screen(u'[download] Downloading playlist: %s' % playlist)
  
              playlist_results = []
  
@@ -419,17 +499,21 @@ class YoutubeDL(object):
              self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                  (ie_result['extractor'], playlist, n_all_entries, n_entries))
  
-            for i,entry in enumerate(entries,1):
-                self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries))
+            for i, entry in enumerate(entries, 1):
+                self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
                  extra = {
-                         'playlist': playlist, 
-                         'playlist_index': i + playliststart,
-                         }
-                if not 'extractor' in entry:
-                    # We set the extractor, if it's an url it will be set then to
-                    # the new extractor, but if it's already a video we must make
-                    # sure it's present: see issue #877
-                    entry['extractor'] = ie_result['extractor']
+                    'playlist': playlist,
+                    'playlist_index': i + playliststart,
+                    'extractor': ie_result['extractor'],
+                    'webpage_url': ie_result['webpage_url'],
+                    'extractor_key': ie_result['extractor_key'],
+                }
+
+                reason = self._match_entry(entry)
+                if reason is not None:
+                    self.to_screen(u'[download] ' + reason)
+                    continue
+
                  entry_result = self.process_ie_result(entry,
                                                        download=download,
                                                        extra_info=extra)
@@ -438,16 +522,37 @@ class YoutubeDL(object):
              return ie_result
          elif result_type == 'compat_list':
              def _fixup(r):
-                r.setdefault('extractor', ie_result['extractor'])
+                self.add_extra_info(r,
+                    {
+                        'extractor': ie_result['extractor'],
+                        'webpage_url': ie_result['webpage_url'],
+                        'extractor_key': ie_result['extractor_key'],
+                    })
                  return r
              ie_result['entries'] = [
-                self.process_ie_result(_fixup(r), download=download)
+                self.process_ie_result(_fixup(r), download, extra_info)
                  for r in ie_result['entries']
              ]
              return ie_result
          else:
              raise Exception('Invalid result type: %s' % result_type)
  
+    def select_format(self, format_spec, available_formats):
+        if format_spec == 'best' or format_spec is None:
+            return available_formats[-1]
+        elif format_spec == 'worst':
+            return available_formats[0]
+        else:
+            extensions = [u'mp4', u'flv', u'webm', u'3gp']
+            if format_spec in extensions:
+                filter_f = lambda f: f['ext'] == format_spec
+            else:
+                filter_f = lambda f: f['format_id'] == format_spec
+            matches = list(filter(filter_f, available_formats))
+            if matches:
+                return matches[-1]
+        return None
+
      def process_video_result(self, info_dict, download=True):
          assert info_dict.get('_type', 'video') == 'video'
  
@@ -457,8 +562,9 @@ class YoutubeDL(object):
              info_dict['playlist_index'] = None
  
          # This extractors handle format selection themselves
-        if info_dict['extractor'] in [u'youtube', u'Youku', u'YouPorn', u'mixcloud']:
-            self.process_info(info_dict)
+        if info_dict['extractor'] in [u'youtube', u'Youku']:
+            if download:
+                self.process_info(info_dict)
              return info_dict
  
          # We now pick which formats have to be downloaded
@@ -470,17 +576,17 @@ class YoutubeDL(object):
  
          # We check that all the formats have the format and format_id fields
          for (i, format) in enumerate(formats):
-            if format.get('format') is None:
-                if format.get('height') is not None:
-                    if format.get('width') is not None:
-                        format_desc = u'%sx%s' % (format['width'], format['height'])
-                    else:
-                        format_desc = u'%sp' % format['height']
-                else:
-                    format_desc = '???'
-                format['format'] = format_desc
              if format.get('format_id') is None:
                  format['format_id'] = compat_str(i)
+            if format.get('format') is None:
+                format['format'] = u'{id} - {res}{note}'.format(
+                    id=format['format_id'],
+                    res=self.format_resolution(format),
+                    note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
+                )
+            # Automatically determine file extension if missing
+            if 'ext' not in format:
+                format['ext'] = determine_ext(format['url'])
  
          if self.params.get('listformats', None):
              self.list_formats(info_dict)
@@ -502,25 +608,24 @@ class YoutubeDL(object):
              formats = sorted(formats, key=_free_formats_key)
  
          req_format = self.params.get('format', 'best')
+        if req_format is None:
+            req_format = 'best'
          formats_to_download = []
-        if req_format == 'best' or req_format is None:
-            formats_to_download = [formats[-1]]
-        elif req_format == 'worst':
-            formats_to_download = [formats[0]]
          # The -1 is for supporting YoutubeIE
-        elif req_format in ('-1', 'all'):
+        if req_format in ('-1', 'all'):
              formats_to_download = formats
          else:
-            # We can accept formats requestd in the format: 34/10/5, we pick
+            # We can accept formats requestd in the format: 34/5/best, we pick
              # the first that is available, starting from left
              req_formats = req_format.split('/')
              for rf in req_formats:
-                matches = filter(lambda f:f['format_id'] == rf ,formats)
-                if matches:
-                    formats_to_download = [matches[0]]
+                selected_format = self.select_format(rf, formats)
+                if selected_format is not None:
+                    formats_to_download = [selected_format]
                      break
          if not formats_to_download:
-            raise ExtractorError(u'requested format not available')
+            raise ExtractorError(u'requested format not available',
+                                 expected=True)
  
          if download:
              if len(formats_to_download) > 1:
@@ -564,20 +669,22 @@ class YoutubeDL(object):
  
          # Forced printings
          if self.params.get('forcetitle', False):
-            compat_print(info_dict['title'])
+            compat_print(info_dict['fulltitle'])
          if self.params.get('forceid', False):
              compat_print(info_dict['id'])
          if self.params.get('forceurl', False):
              # For RTMP URLs, also include the playpath
              compat_print(info_dict['url'] + info_dict.get('play_path', u''))
-        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
+        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
              compat_print(info_dict['thumbnail'])
-        if self.params.get('forcedescription', False) and 'description' in info_dict:
+        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
              compat_print(info_dict['description'])
          if self.params.get('forcefilename', False) and filename is not None:
              compat_print(filename)
          if self.params.get('forceformat', False):
              compat_print(info_dict['format'])
+        if self.params.get('forcejson', False):
+            compat_print(json.dumps(info_dict))
  
          # Do nothing else if in simulate mode
          if self.params.get('simulate', False):
@@ -608,24 +715,24 @@ class YoutubeDL(object):
  
          if self.params.get('writeannotations', False):
              try:
-               annofn = filename + u'.annotations.xml'
-               self.report_writeannotations(annofn)
-               with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
-                   annofile.write(info_dict['annotations'])
+                annofn = filename + u'.annotations.xml'
+                self.report_writeannotations(annofn)
+                with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                    annofile.write(info_dict['annotations'])
              except (KeyError, TypeError):
                  self.report_warning(u'There are no annotations to write.')
              except (OSError, IOError):
-                 self.report_error(u'Cannot write annotations file: ' + annofn)
-                 return
+                self.report_error(u'Cannot write annotations file: ' + annofn)
+                return
  
          subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                         self.params.get('writeautomaticsub')])
  
-        if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
+        if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
              # subtitles download errors are already managed as troubles in relevant IE
              # that way it will silently go on when used with unsupporting IE
              subtitles = info_dict['subtitles']
-            sub_format = self.params.get('subtitlesformat')
+            sub_format = self.params.get('subtitlesformat', 'srt')
              for sub_lang in subtitles.keys():
                  sub = subtitles[sub_lang]
                  if sub is None:
@@ -640,10 +747,10 @@ class YoutubeDL(object):
                      return
  
          if self.params.get('writeinfojson', False):
-            infofn = filename + u'.info.json'
+            infofn = os.path.splitext(filename)[0] + u'.info.json'
              self.report_writeinfojson(infofn)
              try:
-                json_info_dict = dict((k, v) for k,v in info_dict.items() if not k in ['urlhandle'])
+                json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
                  write_json_file(json_info_dict, encodeFilename(infofn))
              except (OSError, IOError):
                  self.report_error(u'Cannot write metadata to JSON file ' + infofn)
@@ -697,7 +804,7 @@ class YoutubeDL(object):
          for url in url_list:
              try:
                  #It also downloads the videos
-                videos = self.extract_info(url)
+                self.extract_info(url)
              except UnavailableVideoError:
                  self.report_error(u'unable to download video')
              except MaxDownloadsReached:
@@ -713,7 +820,7 @@ class YoutubeDL(object):
          keep_video = None
          for pp in self._pps:
              try:
-                keep_video_wish,new_info = pp.run(info)
+                keep_video_wish, new_info = pp.run(info)
                  if keep_video_wish is not None:
                      if keep_video_wish:
                          keep_video = keep_video_wish
@@ -733,7 +840,16 @@ class YoutubeDL(object):
          fn = self.params.get('download_archive')
          if fn is None:
              return False
-        vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+        extractor = info_dict.get('extractor_id')
+        if extractor is None:
+            if 'id' in info_dict:
+                extractor = info_dict.get('ie_key')  # key in a playlist
+        if extractor is None:
+            return False  # Incomplete video information
+        # Future-proof against any change in case
+        # and backwards compatibility with prior versions
+        extractor = extractor.lower()
+        vid_id = extractor + u' ' + info_dict['id']
          try:
              with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                  for line in archive_file:
@@ -752,16 +868,134 @@ class YoutubeDL(object):
          with locked_file(fn, 'a', encoding='utf-8') as archive_file:
              archive_file.write(vid_id + u'\n')
  
+    @staticmethod
+    def format_resolution(format, default='unknown'):
+        if format.get('_resolution') is not None:
+            return format['_resolution']
+        if format.get('height') is not None:
+            if format.get('width') is not None:
+                res = u'%sx%s' % (format['width'], format['height'])
+            else:
+                res = u'%sp' % format['height']
+        else:
+            res = default
+        return res
+
      def list_formats(self, info_dict):
-        formats_s = []
-        for format in info_dict.get('formats', [info_dict]):
-            formats_s.append("%s\t:\t%s\t[%s]" % (format['format_id'],
-                                                format['ext'],
-                                                format.get('format', '???'),
-                                                )
-                            )
-        if len(formats_s) != 1:
-            formats_s[0]  += ' (worst)'
-            formats_s[-1] += ' (best)'
-        formats_s = "\n".join(formats_s)
-        self.to_screen(u"[info] Available formats for %s:\nformat code\textension\n%s" % (info_dict['id'], formats_s)) 
+        def format_note(fdict):
+            res = u''
+            if fdict.get('format_note') is not None:
+                res += fdict['format_note'] + u' '
+            if fdict.get('vcodec') is not None:
+                res += u'%-5s' % fdict['vcodec']
+            elif fdict.get('vbr') is not None:
+                res += u'video'
+            if fdict.get('vbr') is not None:
+                res += u'@%4dk' % fdict['vbr']
+            if fdict.get('acodec') is not None:
+                if res:
+                    res += u', '
+                res += u'%-5s' % fdict['acodec']
+            elif fdict.get('abr') is not None:
+                if res:
+                    res += u', '
+                res += 'audio'
+            if fdict.get('abr') is not None:
+                res += u'@%3dk' % fdict['abr']
+            if fdict.get('filesize') is not None:
+                if res:
+                    res += u', '
+                res += format_bytes(fdict['filesize'])
+            return res
+
+        def line(format, idlen=20):
+            return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
+                format['format_id'],
+                format['ext'],
+                self.format_resolution(format),
+                format_note(format),
+            ))
+
+        formats = info_dict.get('formats', [info_dict])
+        idlen = max(len(u'format code'),
+                    max(len(f['format_id']) for f in formats))
+        formats_s = [line(f, idlen) for f in formats]
+        if len(formats) > 1:
+            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
+            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
+
+        header_line = line({
+            'format_id': u'format code', 'ext': u'extension',
+            '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
+        self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
+                       (info_dict['id'], header_line, u"\n".join(formats_s)))
+
+    def urlopen(self, req):
+        """ Start an HTTP download """
+        return self._opener.open(req)
+
+    def print_debug_header(self):
+        if not self.params.get('verbose'):
+            return
+        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
+        try:
+            sp = subprocess.Popen(
+                ['git', 'rev-parse', '--short', 'HEAD'],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=os.path.dirname(os.path.abspath(__file__)))
+            out, err = sp.communicate()
+            out = out.decode().strip()
+            if re.match('[0-9a-f]+', out):
+                write_string(u'[debug] Git HEAD: ' + out + u'\n')
+        except:
+            try:
+                sys.exc_clear()
+            except:
+                pass
+        write_string(u'[debug] Python version %s - %s' %
+                     (platform.python_version(), platform_name()) + u'\n')
+
+        proxy_map = {}
+        for handler in self._opener.handlers:
+            if hasattr(handler, 'proxies'):
+                proxy_map.update(handler.proxies)
+        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
+
+    def _setup_opener(self, timeout=300):
+        opts_cookiefile = self.params.get('cookiefile')
+        opts_proxy = self.params.get('proxy')
+
+        if opts_cookiefile is None:
+            self.cookiejar = compat_cookiejar.CookieJar()
+        else:
+            self.cookiejar = compat_cookiejar.MozillaCookieJar(
+                opts_cookiefile)
+            if os.access(opts_cookiefile, os.R_OK):
+                self.cookiejar.load()
+
+        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
+            self.cookiejar)
+        if opts_proxy is not None:
+            if opts_proxy == '':
+                proxies = {}
+            else:
+                proxies = {'http': opts_proxy, 'https': opts_proxy}
+        else:
+            proxies = compat_urllib_request.getproxies()
+            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+            if 'http' in proxies and 'https' not in proxies:
+                proxies['https'] = proxies['http']
+        proxy_handler = compat_urllib_request.ProxyHandler(proxies)
+        https_handler = make_HTTPS_handler(
+            self.params.get('nocheckcertificate', False))
+        opener = compat_urllib_request.build_opener(
+            https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+        # Delete the default user-agent header, which would otherwise apply in
+        # cases where our custom HTTP handler doesn't come into play
+        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+        opener.addheaders = []
+        self._opener = opener
+
+        # TODO remove this global modification
+        compat_urllib_request.install_opener(opener)
+        socket.setdefaulttimeout(timeout)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index fce1adf0cffbf527841dfada34b931d93f67fd5a..1f15c7eaa03acc63a5d3cbf1e244b292a053344e 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -32,50 +32,44 @@ __authors__  = (
      'Ismael Mejía',
      'Steffan \'Ruirize\' James',
      'Andras Elso',
+    'Jelle van der Waa',
+    'Marcin Cieślak',
+    'Anton Larionov',
+    'Takuya Tsuchida',
  )
  
  __license__ = 'Public Domain'
  
  import codecs
-import collections
  import getpass
  import optparse
  import os
  import random
  import re
  import shlex
-import socket
  import subprocess
  import sys
-import traceback
-import platform
  
  
  from .utils import (
-    compat_cookiejar,
      compat_print,
-    compat_str,
-    compat_urllib_request,
      DateRange,
      decodeOption,
      determine_ext,
      DownloadError,
      get_cachedir,
-    make_HTTPS_handler,
      MaxDownloadsReached,
-    platform_name,
      preferredencoding,
      SameFileError,
      std_headers,
      write_string,
-    YoutubeDLHandler,
  )
  from .update import update_self
-from .version import __version__
  from .FileDownloader import (
      FileDownloader,
  )
  from .extractor import gen_extractors
+from .version import __version__
  from .YoutubeDL import YoutubeDL
  from .PostProcessor import (
      FFmpegMetadataPP,
@@ -133,7 +127,7 @@ def parseOpts(overrideArguments=None):
  
      def _hide_login_info(opts):
          opts = list(opts)
-        for private_opt in ['-p', '--password', '-u', '--username']:
+        for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
              try:
                  i = opts.index(private_opt)
                  opts[i+1] = '<PRIVATE>'
@@ -304,6 +298,9 @@ def parseOpts(overrideArguments=None):
      verbosity.add_option('--get-format',
              action='store_true', dest='getformat',
              help='simulate, quiet but print output format', default=False)
+    verbosity.add_option('-j', '--dump-json',
+            action='store_true', dest='dumpjson',
+            help='simulate, quiet but print JSON information', default=False)
      verbosity.add_option('--newline',
              action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
      verbosity.add_option('--no-progress',
@@ -316,6 +313,9 @@ def parseOpts(overrideArguments=None):
      verbosity.add_option('--dump-intermediate-pages',
              action='store_true', dest='dump_intermediate_pages', default=False,
              help='print downloaded pages to debug problems(very verbose)')
+    verbosity.add_option('--write-pages',
+            action='store_true', dest='write_pages', default=False,
+            help='Write downloaded pages to files in the current directory')
      verbosity.add_option('--youtube-print-sig-code',
              action='store_true', dest='youtube_print_sig_code', default=False,
              help=optparse.SUPPRESS_HELP)
@@ -336,7 +336,8 @@ def parseOpts(overrideArguments=None):
                    '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, '
                    '%(autonumber)s to get an automatically incremented number, '
                    '%(ext)s for the filename extension, '
-                  '%(format)s for the format description (like "22 - 1280x720" or "HD")'
+                  '%(format)s for the format description (like "22 - 1280x720" or "HD"),'
+                  '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"),'
                    '%(upload_date)s for the upload date (YYYYMMDD), '
                    '%(extractor)s for the provider (youtube, metacafe, etc), '
                    '%(id)s for the video id , %(playlist)s for the playlist the video is in, '
@@ -345,7 +346,7 @@ def parseOpts(overrideArguments=None):
                    'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
      filesystem.add_option('--autonumber-size',
              dest='autonumber_size', metavar='NUMBER',
-            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --autonumber option is given')
+            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given')
      filesystem.add_option('--restrict-filenames',
              action='store_true', dest='restrictfilenames',
              help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False)
@@ -354,7 +355,7 @@ def parseOpts(overrideArguments=None):
      filesystem.add_option('-w', '--no-overwrites',
              action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
      filesystem.add_option('-c', '--continue',
-            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
+            action='store_true', dest='continue_dl', help='force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.', default=True)
      filesystem.add_option('--no-continue',
              action='store_false', dest='continue_dl',
              help='do not resume partially downloaded files (restart from beginning)')
@@ -441,19 +442,6 @@ def _real_main(argv=None):
  
      parser, opts, args = parseOpts(argv)
  
-    # Open appropriate CookieJar
-    if opts.cookiefile is None:
-        jar = compat_cookiejar.CookieJar()
-    else:
-        try:
-            jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile)
-            if os.access(opts.cookiefile, os.R_OK):
-                jar.load()
-        except (IOError, OSError) as err:
-            if opts.verbose:
-                traceback.print_exc()
-            write_string(u'ERROR: unable to open cookie file\n')
-            sys.exit(101)
      # Set user agent
      if opts.user_agent is not None:
          std_headers['User-Agent'] = opts.user_agent
@@ -485,8 +473,6 @@ def _real_main(argv=None):
      all_urls = batchurls + args
      all_urls = [url.strip() for url in all_urls]
  
-    opener = _setup_opener(jar=jar, opts=opts)
-
      extractors = gen_extractors()
  
      if opts.list_extractors:
@@ -541,7 +527,7 @@ def _real_main(argv=None):
      if opts.retries is not None:
          try:
              opts.retries = int(opts.retries)
-        except (TypeError, ValueError) as err:
+        except (TypeError, ValueError):
              parser.error(u'invalid retry count specified')
      if opts.buffersize is not None:
          numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
@@ -552,13 +538,13 @@ def _real_main(argv=None):
          opts.playliststart = int(opts.playliststart)
          if opts.playliststart <= 0:
              raise ValueError(u'Playlist start must be positive')
-    except (TypeError, ValueError) as err:
+    except (TypeError, ValueError):
          parser.error(u'invalid playlist start number specified')
      try:
          opts.playlistend = int(opts.playlistend)
          if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
              raise ValueError(u'Playlist end must be greater than playlist start')
-    except (TypeError, ValueError) as err:
+    except (TypeError, ValueError):
          parser.error(u'invalid playlist end number specified')
      if opts.extractaudio:
          if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
@@ -597,13 +583,12 @@ def _real_main(argv=None):
                       u' file! Use "%%(ext)s" instead of %r' %
                       determine_ext(outtmpl, u''))
  
-    # YoutubeDL
-    ydl = YoutubeDL({
+    ydl_opts = {
          'usenetrc': opts.usenetrc,
          'username': opts.username,
          'password': opts.password,
          'videopassword': opts.videopassword,
-        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
          'forceurl': opts.geturl,
          'forcetitle': opts.gettitle,
          'forceid': opts.getid,
@@ -611,8 +596,9 @@ def _real_main(argv=None):
          'forcedescription': opts.getdescription,
          'forcefilename': opts.getfilename,
          'forceformat': opts.getformat,
+        'forcejson': opts.dumpjson,
          'simulate': opts.simulate,
-        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
          'format': opts.format,
          'format_limit': opts.format_limit,
          'listformats': opts.listformats,
@@ -651,6 +637,7 @@ def _real_main(argv=None):
          'prefer_free_formats': opts.prefer_free_formats,
          'verbose': opts.verbose,
          'dump_intermediate_pages': opts.dump_intermediate_pages,
+        'write_pages': opts.write_pages,
          'test': opts.test,
          'keepvideo': opts.keepvideo,
          'min_filesize': opts.min_filesize,
@@ -660,102 +647,45 @@ def _real_main(argv=None):
          'youtube_print_sig_code': opts.youtube_print_sig_code,
          'age_limit': opts.age_limit,
          'download_archive': opts.download_archive,
-        })
-
-    if opts.verbose:
-        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
-        try:
-            sp = subprocess.Popen(
-                ['git', 'rev-parse', '--short', 'HEAD'],
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                cwd=os.path.dirname(os.path.abspath(__file__)))
-            out, err = sp.communicate()
-            out = out.decode().strip()
-            if re.match('[0-9a-f]+', out):
-                write_string(u'[debug] Git HEAD: ' + out + u'\n')
-        except:
-            try:
-                sys.exc_clear()
-            except:
-                pass
-        write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n')
-
-        proxy_map = {}
-        for handler in opener.handlers:
-            if hasattr(handler, 'proxies'):
-                proxy_map.update(handler.proxies)
-        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
-
-    ydl.add_default_info_extractors()
-
-    # PostProcessors
-    # Add the metadata pp first, the other pps will copy it
-    if opts.addmetadata:
-        ydl.add_post_processor(FFmpegMetadataPP())
-    if opts.extractaudio:
-        ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
-    if opts.recodevideo:
-        ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
-    if opts.embedsubtitles:
-        ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
-
-    # Update version
-    if opts.update_self:
-        update_self(ydl.to_screen, opts.verbose)
-
-    # Maybe do nothing
-    if len(all_urls) < 1:
-        if not opts.update_self:
-            parser.error(u'you must provide at least one URL')
-        else:
-            sys.exit()
+        'cookiefile': opts.cookiefile,
+        'nocheckcertificate': opts.no_check_certificate,
+    }
  
-    try:
-        retcode = ydl.download(all_urls)
-    except MaxDownloadsReached:
-        ydl.to_screen(u'--max-download limit reached, aborting.')
-        retcode = 101
+    with YoutubeDL(ydl_opts) as ydl:
+        ydl.print_debug_header()
+        ydl.add_default_info_extractors()
+
+        # PostProcessors
+        # Add the metadata pp first, the other pps will copy it
+        if opts.addmetadata:
+            ydl.add_post_processor(FFmpegMetadataPP())
+        if opts.extractaudio:
+            ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
+        if opts.recodevideo:
+            ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
+        if opts.embedsubtitles:
+            ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
+
+        # Update version
+        if opts.update_self:
+            update_self(ydl.to_screen, opts.verbose)
+
+        # Maybe do nothing
+        if len(all_urls) < 1:
+            if not opts.update_self:
+                parser.error(u'you must provide at least one URL')
+            else:
+                sys.exit()
  
-    # Dump cookie jar if requested
-    if opts.cookiefile is not None:
          try:
-            jar.save()
-        except (IOError, OSError):
-            sys.exit(u'ERROR: unable to save cookie jar')
+            retcode = ydl.download(all_urls)
+        except MaxDownloadsReached:
+            ydl.to_screen(u'--max-download limit reached, aborting.')
+            retcode = 101
  
      sys.exit(retcode)
  
  
-def _setup_opener(jar=None, opts=None, timeout=300):
-    if opts is None:
-        FakeOptions = collections.namedtuple(
-            'FakeOptions', ['proxy', 'no_check_certificate'])
-        opts = FakeOptions(proxy=None, no_check_certificate=False)
-
-    cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
-    if opts.proxy is not None:
-        if opts.proxy == '':
-            proxies = {}
-        else:
-            proxies = {'http': opts.proxy, 'https': opts.proxy}
-    else:
-        proxies = compat_urllib_request.getproxies()
-        # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
-        if 'http' in proxies and 'https' not in proxies:
-            proxies['https'] = proxies['http']
-    proxy_handler = compat_urllib_request.ProxyHandler(proxies)
-    https_handler = make_HTTPS_handler(opts)
-    opener = compat_urllib_request.build_opener(
-        https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
-    # Delete the default user-agent header, which would otherwise apply in
-    # cases where our custom HTTP handler doesn't come into play
-    # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
-    opener.addheaders = []
-    compat_urllib_request.install_opener(opener)
-    socket.setdefaulttimeout(timeout)
-    return opener
-
-
  def main(argv=None):
      try:
          _real_main(argv)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index db69af361929fd7ff726d1a1df980730cad3630c..0b4d086b77314d502db0b08c3d0af244803e8e2e 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,5 +1,6 @@
  from .appletrailers import AppleTrailersIE
  from .addanime import AddAnimeIE
+from .anitube import AnitubeIE
  from .archiveorg import ArchiveOrgIE
  from .ard import ARDIE
  from .arte import (
@@ -9,7 +10,8 @@ from .arte import (
      ArteTVFutureIE,
  )
  from .auengine import AUEngineIE
-from .bandcamp import BandcampIE
+from .bambuser import BambuserIE, BambuserChannelIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .bloomberg import BloombergIE
  from .breakcom import BreakIE
@@ -18,12 +20,14 @@ from .c56 import C56IE
  from .canalplus import CanalplusIE
  from .canalc2 import Canalc2IE
  from .cinemassacre import CinemassacreIE
+from .clipfish import ClipfishIE
  from .cnn import CNNIE
  from .collegehumor import CollegeHumorIE
-from .comedycentral import ComedyCentralIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
  from .condenast import CondeNastIE
  from .criterion import CriterionIE
  from .cspan import CSpanIE
+from .d8 import D8IE
  from .dailymotion import (
      DailymotionIE,
      DailymotionPlaylistIE,
@@ -37,8 +41,10 @@ from .defense import DefenseGouvFrIE
  from .ebaumsworld import EbaumsWorldIE
  from .ehow import EHowIE
  from .eighttracks import EightTracksIE
+from .eitb import EitbIE
  from .escapist import EscapistIE
  from .exfm import ExfmIE
+from .extremetube import ExtremeTubeIE
  from .facebook import FacebookIE
  from .faz import FazIE
  from .fktv import (
@@ -54,6 +60,7 @@ from .francetv import (
  )
  from .freesound import FreesoundIE
  from .funnyordie import FunnyOrDieIE
+from .gamekings import GamekingsIE
  from .gamespot import GameSpotIE
  from .gametrailers import GametrailersIE
  from .generic import GenericIE
@@ -72,16 +79,19 @@ from .jeuxvideo import JeuxVideoIE
  from .jukebox import JukeboxIE
  from .justintv import JustinTVIE
  from .kankan import KankanIE
+from .keezmovies import KeezMoviesIE
  from .kickstarter import KickStarterIE
  from .keek import KeekIE
  from .liveleak import LiveLeakIE
-from .livestream import LivestreamIE
+from .livestream import LivestreamIE, LivestreamOriginalIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mit import TechTVMITIE, MITIE
  from .mixcloud import MixcloudIE
+from .mofosex import MofosexIE
  from .mtv import MTVIE
  from .muzu import MuzuTVIE
+from .myspace import MySpaceIE
  from .myspass import MySpassIE
  from .myvideo import MyVideoIE
  from .naver import NaverIE
@@ -89,11 +99,13 @@ from .nba import NBAIE
  from .nbc import NBCNewsIE
  from .newgrounds import NewgroundsIE
  from .nhl import NHLIE, NHLVideocenterIE
+from .niconico import NiconicoIE
  from .nowvideo import NowVideoIE
  from .ooyala import OoyalaIE
  from .orf import ORFIE
  from .pbs import PBSIE
  from .photobucket import PhotobucketIE
+from .pornhub import PornHubIE
  from .pornotube import PornotubeIE
  from .rbmaradio import RBMARadioIE
  from .redtube import RedTubeIE
@@ -108,22 +120,31 @@ from .slashdot import SlashdotIE
  from .slideshare import SlideshareIE
  from .sohu import SohuIE
  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
-from .southparkstudios import SouthParkStudiosIE
+from .southparkstudios import (
+    SouthParkStudiosIE,
+    SouthparkDeIE,
+)
+from .space import SpaceIE
+from .spankwire import SpankwireIE
  from .spiegel import SpiegelIE
  from .stanfordoc import StanfordOpenClassroomIE
  from .statigram import StatigramIE
  from .steam import SteamIE
+from .streamcloud import StreamcloudIE
  from .sztvhu import SztvHuIE
  from .teamcoco import TeamcocoIE
  from .techtalks import TechTalksIE
  from .ted import TEDIE
  from .tf1 import TF1IE
  from .thisav import ThisAVIE
+from .toutv import TouTvIE
  from .traileraddict import TrailerAddictIE
  from .trilulilu import TriluliluIE
+from .tube8 import Tube8IE
  from .tudou import TudouIE
  from .tumblr import TumblrIE
  from .tutv import TutvIE
+from .tvp import TvpIE
  from .unistra import UnistraIE
  from .ustream import UstreamIE, UstreamChannelIE
  from .vbox7 import Vbox7IE
@@ -137,6 +158,8 @@ from .videofyme import VideofyMeIE
  from .videopremium import VideoPremiumIE
  from .vimeo import VimeoIE, VimeoChannelIE
  from .vine import VineIE
+from .viki import VikiIE
+from .vk import VKIE
  from .wat import WatIE
  from .websurg import WeBSurgIE
  from .weibo import WeiboIE
@@ -145,6 +168,7 @@ from .worldstarhiphop import WorldStarHipHopIE
  from .xhamster import XHamsterIE
  from .xnxx import XNXXIE
  from .xvideos import XVideosIE
+from .xtube import XTubeIE
  from .yahoo import YahooIE, YahooSearchIE
  from .youjizz import YouJizzIE
  from .youku import YoukuIE
@@ -153,6 +177,7 @@ from .youtube import (
      YoutubeIE,
      YoutubePlaylistIE,
      YoutubeSearchIE,
+    YoutubeSearchDateIE,
      YoutubeUserIE,
      YoutubeChannelIE,
      YoutubeShowIE,
@@ -161,6 +186,7 @@ from .youtube import (
      YoutubeTruncatedURLIE,
      YoutubeWatchLaterIE,
      YoutubeFavouritesIE,
+    YoutubeHistoryIE,
  )
  from .zdf import ZDFIE
  
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py

index 82a785a19c34517c17da294ad64c1cbe7d22cba4..b99d4b96689c23a13379d4392484c3763ce0e36f 100644 (file)
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -17,8 +17,8 @@ class AddAnimeIE(InfoExtractor):
      IE_NAME = u'AddAnime'
      _TEST = {
          u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
-        u'file': u'24MR3YO5SAS9.flv',
-        u'md5': u'1036a0e0cd307b95bd8a8c3a5c8cfaf1',
+        u'file': u'24MR3YO5SAS9.mp4',
+        u'md5': u'72954ea10bc979ab5e2eb288b21425a0',
          u'info_dict': {
              u"description": u"One Piece 606",
              u"title": u"One Piece 606"
@@ -31,7 +31,8 @@ class AddAnimeIE(InfoExtractor):
              video_id = mobj.group('video_id')
              webpage = self._download_webpage(url, video_id)
          except ExtractorError as ee:
-            if not isinstance(ee.cause, compat_HTTPError):
+            if not isinstance(ee.cause, compat_HTTPError) or \
+               ee.cause.code != 503:
                  raise
  
              redir_webpage = ee.cause.read().decode('utf-8')
@@ -60,16 +61,26 @@ class AddAnimeIE(InfoExtractor):
                  note=u'Confirming after redirect')
              webpage = self._download_webpage(url, video_id)
  
-        video_url = self._search_regex(r"var normal_video_file = '(.*?)';",
-                                       webpage, u'video file URL')
+        formats = []
+        for format_id in ('normal', 'hq'):
+            rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
+            video_url = self._search_regex(rex, webpage, u'video file URLx',
+                                           fatal=False)
+            if not video_url:
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': video_url,
+            })
+        if not formats:
+            raise ExtractorError(u'Cannot find any video format!')
          video_title = self._og_search_title(webpage)
          video_description = self._og_search_description(webpage)
  
          return {
              '_type': 'video',
              'id':  video_id,
-            'url': video_url,
-            'ext': 'flv',
+            'formats': formats,
              'title': video_title,
              'description': video_description
          }
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py

new file mode 100644 (file)

index 0000000..691d5a8
--- /dev/null
+++ b/youtube_dl/extractor/anitube.py
@@ -0,0 +1,55 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+
+
+class AnitubeIE(InfoExtractor):
+    IE_NAME = u'anitube.se'
+    _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.anitube.se/video/36621',
+        u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
+        u'file': u'36621.mp4',
+        u'info_dict': {
+            u'id': u'36621',
+            u'ext': u'mp4',
+            u'title': u'Recorder to Randoseru 01',
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
+                                      webpage, u'key')
+
+        webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
+                                                key)
+        config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8'))
+
+        video_title = config_xml.find('title').text
+
+        formats = []
+        video_url = config_xml.find('file')
+        if video_url is not None:
+            formats.append({
+                'format_id': 'sd',
+                'url': video_url.text,
+            })
+        video_url = config_xml.find('filehd')
+        if video_url is not None:
+            formats.append({
+                'format_id': 'hd',
+                'url': video_url.text,
+            })
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index 5ee8a67b14699a330914cd4f0e0f627ca9fca5a5..44d0b5d708aa31aef6d9997321142cb8638ab56b 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -10,6 +10,7 @@ from ..utils import (
      unified_strdate,
      determine_ext,
      get_element_by_id,
+    compat_str,
  )
  
  # There are different sources of video in arte.tv, the extraction process 
@@ -68,7 +69,7 @@ class ArteTvIE(InfoExtractor):
              lang = mobj.group('lang')
              return self._extract_liveweb(url, name, lang)
  
-        if re.search(self._LIVE_URL, video_id) is not None:
+        if re.search(self._LIVE_URL, url) is not None:
              raise ExtractorError(u'Arte live streams are not yet supported, sorry')
              # self.extractLiveStream(url)
              # return
@@ -114,7 +115,7 @@ class ArteTvIE(InfoExtractor):
          event_doc = config_doc.find('event')
          url_node = event_doc.find('video').find('urlHd')
          if url_node is None:
-            url_node = video_doc.find('urlSd')
+            url_node = event_doc.find('urlSd')
  
          return {'id': video_id,
                  'title': event_doc.find('name%s' % lang.capitalize()).text,
@@ -158,7 +159,9 @@ class ArteTVPlus7IE(InfoExtractor):
              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
          }
  
-        formats = player_info['VSR'].values()
+        all_formats = player_info['VSR'].values()
+        # Some formats use the m3u8 protocol
+        all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
          def _match_lang(f):
              if f.get('versionCode') is None:
                  return True
@@ -170,18 +173,48 @@ class ArteTVPlus7IE(InfoExtractor):
              regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
              return any(re.match(r, f['versionCode']) for r in regexes)
          # Some formats may not be in the same language as the url
-        formats = filter(_match_lang, formats)
-        # Some formats use the m3u8 protocol
-        formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
-        # We order the formats by quality
-        formats = sorted(formats, key=lambda f: int(f.get('height',-1)))
-        # Prefer videos without subtitles in the same language
-        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
-        # Pick the best quality
+        formats = filter(_match_lang, all_formats)
+        formats = list(formats) # in python3 filter returns an iterator
+        if not formats:
+            # Some videos are only available in the 'Originalversion'
+            # they aren't tagged as being in French or German
+            if all(f['versionCode'] == 'VO' for f in all_formats):
+                formats = all_formats
+            else:
+                raise ExtractorError(u'The formats list is empty')
+
+        if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
+            def sort_key(f):
+                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
+        else:
+            def sort_key(f):
+                return (
+                    # Sort first by quality
+                    int(f.get('height',-1)),
+                    int(f.get('bitrate',-1)),
+                    # The original version with subtitles has lower relevance
+                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
+                    # The version with sourds/mal subtitles has also lower relevance
+                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
+                )
+        formats = sorted(formats, key=sort_key)
          def _format(format_info):
+            quality = ''
+            height = format_info.get('height')
+            if height is not None:
+                quality = compat_str(height)
+            bitrate = format_info.get('bitrate')
+            if bitrate is not None:
+                quality += '-%d' % bitrate
+            if format_info.get('versionCode') is not None:
+                format_id = u'%s-%s' % (quality, format_info['versionCode'])
+            else:
+                format_id = quality
              info = {
+                'format_id': format_id,
+                'format_note': format_info.get('versionLibelle'),
                  'width': format_info.get('width'),
-                'height': format_info.get('height'),
+                'height': height,
              }
              if format_info['mediaType'] == u'rtmp':
                  info['url'] = format_info['streamer']
@@ -192,8 +225,6 @@ class ArteTVPlus7IE(InfoExtractor):
                  info['ext'] = determine_ext(info['url'])
              return info
          info_dict['formats'] = [_format(f) for f in formats]
-        # TODO: Remove when #980 has been merged 
-        info_dict.update(info_dict['formats'][-1])
  
          return info_dict
  
@@ -207,7 +238,7 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
          u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
          u'file': u'050489-002.mp4',
          u'info_dict': {
-            u'title': u'Agentur Amateur #2 - Corporate Design',
+            u'title': u'Agentur Amateur / Agence Amateur #2 : Corporate Design',
          },
      }
  
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py

index 0febbff4f6c42afd10f8dbc13ea9df883edae4c6..95c038003b431dc48ac3bb89dcc03f8aa39ea07f 100644 (file)
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -1,10 +1,10 @@
-import os.path
  import re
  
  from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
+    determine_ext,
+    ExtractorError,
  )
  
  class AUEngineIE(InfoExtractor):
@@ -25,22 +25,25 @@ class AUEngineIE(InfoExtractor):
          title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
                  webpage, u'title')
          title = title.strip()
-        links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage)
-        links = [compat_urllib_parse.unquote(l) for l in links]
+        links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
+        links = map(compat_urllib_parse.unquote, links)
+
+        thumbnail = None
+        video_url = None
          for link in links:
-            root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path)
-            if pathext == '.png':
+            if link.endswith('.png'):
                  thumbnail = link
-            elif pathext == '.mp4':
-                url = link
-                ext = pathext
+            elif '/videos/' in link:
+                video_url = link
+        if not video_url:
+            raise ExtractorError(u'Could not find video URL')
+        ext = u'.' + determine_ext(video_url)
          if ext == title[-len(ext):]:
              title = title[:-len(ext)]
-        ext = ext[1:]
-        return [{
+
+        return {
              'id':        video_id,
-            'url':       url,
-            'ext':       ext,
+            'url':       video_url,
              'title':     title,
              'thumbnail': thumbnail,
-        }]
+        }
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py

new file mode 100644 (file)

index 0000000..967568c
--- /dev/null
+++ b/youtube_dl/extractor/bambuser.py
@@ -0,0 +1,81 @@
+import re
+import json
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+)
+
+
+class BambuserIE(InfoExtractor):
+    IE_NAME = u'bambuser'
+    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
+    _API_KEY = '005f64509e19a868399060af746a00aa'
+
+    _TEST = {
+        u'url': u'http://bambuser.com/v/4050584',
+        # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
+        #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+        u'info_dict': {
+            u'id': u'4050584',
+            u'ext': u'flv',
+            u'title': u'Education engineering days - lightning talks',
+            u'duration': 3741,
+            u'uploader': u'pixelversity',
+            u'uploader_id': u'344706',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
+            '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
+        info_json = self._download_webpage(info_url, video_id)
+        info = json.loads(info_json)['result']
+
+        return {
+            'id': video_id,
+            'title': info['title'],
+            'url': info['url'],
+            'thumbnail': info.get('preview'),
+            'duration': int(info['length']),
+            'view_count': int(info['views_total']),
+            'uploader': info['username'],
+            'uploader_id': info['uid'],
+        }
+
+
+class BambuserChannelIE(InfoExtractor):
+    IE_NAME = u'bambuser:channel'
+    _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
+    # The maximum number we can get with each request
+    _STEP = 50
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+        urls = []
+        last_id = ''
+        for i in itertools.count(1):
+            req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
+                '&sort=created&access_mode=0%2C1%2C2&limit={count}'
+                '&method=broadcast&format=json&vid_older_than={last}'
+                ).format(user=user, count=self._STEP, last=last_id)
+            req = compat_urllib_request.Request(req_url)
+            # Without setting this header, we wouldn't get any result
+            req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
+            info_json = self._download_webpage(req, user,
+                u'Downloading page %d' % i)
+            results = json.loads(info_json)['result']
+            if len(results) == 0:
+                break
+            last_id = results[-1]['vid']
+            urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
+
+        return {
+            '_type': 'playlist',
+            'title': user,
+            'entries': urls,
+        }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py

index 129a20f4497b4cc6fc9f031e8e48dd8eb8980f66..3a32c14c598dd2da14841fe68c1cb59582f30799 100644 (file)
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -3,13 +3,16 @@ import re
  
  from .common import InfoExtractor
  from ..utils import (
+    compat_str,
+    compat_urlparse,
      ExtractorError,
  )
  
  
  class BandcampIE(InfoExtractor):
+    IE_NAME = u'Bandcamp'
      _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
          u'file': u'1812978515.mp3',
          u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
@@ -17,7 +20,7 @@ class BandcampIE(InfoExtractor):
              u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
          },
          u'skip': u'There is a limit of 200 free downloads / month for the test song'
-    }
+    }]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -26,6 +29,23 @@ class BandcampIE(InfoExtractor):
          # We get the link to the free download page
          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
          if m_download is None:
+            m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
+        if m_trackinfo:
+            json_code = m_trackinfo.group(1)
+            data = json.loads(json_code)
+
+            for d in data:
+                formats = [{
+                    'format_id': 'format_id',
+                    'url': format_url,
+                    'ext': format_id.partition('-')[0]
+                } for format_id, format_url in sorted(d['file'].items())]
+                return {
+                    'id': compat_str(d['id']),
+                    'title': d['title'],
+                    'formats': formats,
+                }
+        else:
              raise ExtractorError(u'No free songs found')
  
          download_link = m_download.group(1)
@@ -61,3 +81,49 @@ class BandcampIE(InfoExtractor):
                        }
  
          return [track_info]
+
+
+class BandcampAlbumIE(InfoExtractor):
+    """Extract a Bandcamp album page as a playlist of its track pages."""
+    IE_NAME = u'Bandcamp:album'
+    _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+    _TEST = {
+        u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+        u'playlist': [
+            {
+                u'file': u'1353101989.mp3',
+                u'md5': u'39bc1eded3476e927c724321ddf116cf',
+                u'info_dict': {
+                    u'title': u'Intro',
+                }
+            },
+            {
+                u'file': u'38097443.mp3',
+                u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
+                u'info_dict': {
+                    u'title': u'Kero One - Keep It Alive (Blazo remix)',
+                }
+            },
+        ],
+        u'params': {
+            u'playlistend': 2
+        },
+        u'skip': u'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of url results, one per track link on the page."""
+        title = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, title)
+        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
+        if not tracks_paths:
+            raise ExtractorError(u'The page doesn\'t contain any track')
+        entries = [
+            self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+            for t_path in tracks_paths]
+        title = self._search_regex(r'album_title : "(.*?)"', webpage, u'title')
+        return {
+            '_type': 'playlist',
+            'title': title,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index 1392f382a24c273604f0c67db7afafefbcec85b8..66fe0ac9ade6fad80d77f0429c136c2d022af16d 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -9,10 +9,13 @@ from ..utils import (
      compat_urllib_parse,
      find_xpath_attr,
      compat_urlparse,
+    compat_str,
+    compat_urllib_request,
  
      ExtractorError,
  )
  
+
  class BrightcoveIE(InfoExtractor):
      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
@@ -23,7 +26,7 @@ class BrightcoveIE(InfoExtractor):
              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
              u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
              u'file': u'2371591881001.mp4',
-            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'md5': u'8eccab865181d29ec2958f32a6a754f5',
              u'note': u'Test Brightcove downloads and detection in GenericIE',
              u'info_dict': {
                  u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
@@ -41,6 +44,17 @@ class BrightcoveIE(InfoExtractor):
                  u'uploader': u'Oracle',
              },
          },
+        {
+            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
+            u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
+            u'info_dict': {
+                u'id': u'2750934548001',
+                u'ext': u'mp4',
+                u'title': u'This Bracelet Acts as a Personal Thermostat',
+                u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0',
+                u'uploader': u'Mashable',
+            },
+        },
      ]
  
      @classmethod
@@ -61,31 +75,61 @@ class BrightcoveIE(InfoExtractor):
          params = {'flashID': object_doc.attrib['id'],
                    'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
                    }
-        playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+        def find_param(name):
+            node = find_xpath_attr(object_doc, './param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+            return None
+        playerKey = find_param('playerKey')
          # Not all pages define this value
          if playerKey is not None:
-            params['playerKey'] = playerKey.attrib['value']
-        videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+            params['playerKey'] = playerKey
+        # The three fields hold the id of the video
+        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
          if videoPlayer is not None:
-            params['@videoPlayer'] = videoPlayer.attrib['value']
+            params['@videoPlayer'] = videoPlayer
+        linkBase = find_param('linkBaseURL')
+        if linkBase is not None:
+            params['linkBaseURL'] = linkBase
          data = compat_urllib_parse.urlencode(params)
          return cls._FEDERATED_URL_TEMPLATE % data
  
+    @classmethod
+    def _extract_brightcove_url(cls, webpage):
+        """Try to extract the Brightcove url from the webpage; return None
+        if no BrightcoveExperience <object> element can be found.
+        """
+        m_brightcove = re.search(
+            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
+            webpage, re.DOTALL)
+        if m_brightcove is not None:
+            return cls._build_brighcove_url(m_brightcove.group())
+        else:
+            return None
+
      def _real_extract(self, url):
+        # Change the 'videoId' and others field to '@videoPlayer'
+        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
+        # Change bckey (used by bcove.me urls) to playerKey
+        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
          mobj = re.match(self._VALID_URL, url)
          query_str = mobj.group('query')
          query = compat_urlparse.parse_qs(query_str)
  
          videoPlayer = query.get('@videoPlayer')
          if videoPlayer:
-            return self._get_video_info(videoPlayer[0], query_str)
+            return self._get_video_info(videoPlayer[0], query_str, query)
          else:
              player_key = query['playerKey']
              return self._get_playlist_info(player_key[0])
  
-    def _get_video_info(self, video_id, query):
-        request_url = self._FEDERATED_URL_TEMPLATE % query
-        webpage = self._download_webpage(request_url, video_id)
+    def _get_video_info(self, video_id, query_str, query):
+        request_url = self._FEDERATED_URL_TEMPLATE % query_str
+        req = compat_urllib_request.Request(request_url)
+        linkBase = query.get('linkBaseURL')
+        if linkBase is not None:
+            req.add_header('Referer', linkBase[0])
+        webpage = self._download_webpage(req, video_id)
  
          self.report_extraction(video_id)
          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
@@ -109,7 +153,7 @@ class BrightcoveIE(InfoExtractor):
  
      def _extract_video_info(self, video_info):
          info = {
-            'id': video_info['id'],
+            'id': compat_str(video_info['id']),
              'title': video_info['displayName'],
              'description': video_info.get('shortDescription'),
              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
@@ -119,15 +163,14 @@ class BrightcoveIE(InfoExtractor):
          renditions = video_info.get('renditions')
          if renditions:
              renditions = sorted(renditions, key=lambda r: r['size'])
-            best_format = renditions[-1]
-            info.update({
-                'url': best_format['defaultURL'],
-                'ext': 'mp4',
-            })
+            info['formats'] = [{
+                'url': rend['defaultURL'],
+                'height': rend.get('frameHeight'),
+                'width': rend.get('frameWidth'),
+            } for rend in renditions]
          elif video_info.get('FLVFullLengthURL') is not None:
              info.update({
                  'url': video_info['FLVFullLengthURL'],
-                'ext': 'flv',
              })
          else:
              raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py

index e7f4fa9fdc569b9eb559fd3358169e81c3cdfa3a..3d8d7f9d2dee4713b467e47ab79bbb55edccf147 100644 (file)
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
  
  class Canalc2IE(InfoExtractor):
      IE_NAME = 'canalc2.tv'
-    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui'
+    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
  
      _TEST = {
          u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
@@ -18,7 +18,9 @@ class Canalc2IE(InfoExtractor):
      }
  
      def _real_extract(self, url):
-        video_id = re.match(self._VALID_URL, url).group(1)
+        video_id = re.match(self._VALID_URL, url).group('id')
+        # We need to set the voir field for getting the file name
+        url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
          webpage = self._download_webpage(url, video_id)
          file_name = self._search_regex(
              r"so\.addVariable\('file','(.*?)'\);",
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py

index 1db9b24cf204cc26d68b1a1bdaff93577c3ae903..bfa2a8b4063163729b3d8c11d63d7567a81ab59e 100644 (file)
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,6 +5,7 @@ import xml.etree.ElementTree
  from .common import InfoExtractor
  from ..utils import unified_strdate
  
+
  class CanalplusIE(InfoExtractor):
      _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
      _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
@@ -25,7 +26,7 @@ class CanalplusIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.groupdict().get('id')
          if video_id is None:
              webpage = self._download_webpage(url, mobj.group('path'))
              video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py

index 6925b96c2ee1fd1e09624638805597259b068dcd..f0d08cebfce87b006b339508f655eba95a4bc1ef 100644 (file)
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor):
          webpage_url = u'http://' + mobj.group('url')
          webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
          video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
-        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
          if not mobj:
              raise ExtractorError(u'Can\'t extract embed url and video id')
          playerdata_url = mobj.group(u'embed_url')
@@ -55,30 +55,32 @@ class CinemassacreIE(InfoExtractor):
              video_description = None
  
          playerdata = self._download_webpage(playerdata_url, video_id)
-        base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'',
-            playerdata, u'base_url')
-        base_url += '/Cinemassacre/'
-        # Important: The file names in playerdata are not used by the player and even wrong for some videos
-        sd_file = 'Cinemassacre-%s_high.mp4' % video_id
-        hd_file = 'Cinemassacre-%s.mp4' % video_id
-        video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id
+        url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
+
+        sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
+        hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
+        video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
  
          formats = [
              {
-                'url': base_url + sd_file,
+                'url': url,
+                'play_path': 'mp4:' + sd_file,
+                'rtmp_live': True, # workaround
                  'ext': 'flv',
                  'format': 'sd',
                  'format_id': 'sd',
              },
              {
-                'url': base_url + hd_file,
+                'url': url,
+                'play_path': 'mp4:' + hd_file,
+                'rtmp_live': True, # workaround
                  'ext': 'flv',
                  'format': 'hd',
                  'format_id': 'hd',
              },
          ]
  
-        info = {
+        return {
              'id': video_id,
              'title': video_title,
              'formats': formats,
@@ -86,6 +88,3 @@ class CinemassacreIE(InfoExtractor):
              'upload_date': video_date,
              'thumbnail': video_thumbnail,
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py

new file mode 100644 (file)

index 0000000..95449da
--- /dev/null
+++ b/youtube_dl/extractor/clipfish.py
@@ -0,0 +1,53 @@
+import re
+import time
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+
+
+class ClipfishIE(InfoExtractor):
+    """Extractor for clipfish.de video pages, backed by the videoinfo XML."""
+    IE_NAME = u'clipfish'
+    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _TEST = {
+        u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/',
+        u'file': u'4028320.f4v',
+        u'md5': u'5e38bda8c329fbfb42be0386a3f5a382',
+        u'info_dict': {
+            u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect',
+            u'duration': 399,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        # Metadata comes from the devxml endpoint; ts is the current Unix time
+        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
+                    (video_id, int(time.time())))
+        info_xml = self._download_webpage(
+            info_url, video_id, note=u'Downloading info page')
+        doc = xml.etree.ElementTree.fromstring(info_xml)
+        title = doc.find('title').text
+        video_url = doc.find('filename').text
+        thumbnail = doc.find('imageurl').text
+        duration_str = doc.find('duration').text
+        m = re.match(
+            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
+            duration_str)
+        if m:
+            duration = (
+                (int(m.group('hours')) * 60 * 60) +
+                (int(m.group('minutes')) * 60) +
+                (int(m.group('seconds')))
+            )
+        else:
+            duration = None
+        # duration is in whole seconds (the ms field is ignored), None if unparsable
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py

index a79f881cd9dbf54b8fdb3ff229de1d0e6b9c6aac..34adf6dda519a5ed2657fee3687d9e2e0f52ef73 100644 (file)
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -6,7 +6,7 @@ from ..utils import determine_ext
  
  
  class CNNIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/
+    _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/
          (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''
  
      _TESTS = [{
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py

index 8d4c93d6da91f4470c9809bf32dd0fbbe886c92b..b27c1dfc52401f3c148d48d2b2897d2b06db3834 100644 (file)
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
  
          self.report_extraction(video_id)
          xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
-        metaXml = self._download_webpage(xmlUrl, video_id,
+        mdoc = self._download_xml(xmlUrl, video_id,
                                           u'Downloading info XML',
                                           u'Unable to download video info XML')
  
-        mdoc = xml.etree.ElementTree.fromstring(metaXml)
          try:
              videoNode = mdoc.findall('./video')[0]
              youtubeIdNode = videoNode.find('./youtubeID')
@@ -65,16 +63,13 @@ class CollegeHumorIE(InfoExtractor):
  
          if next_url.endswith(u'manifest.f4m'):
              manifest_url = next_url + '?hdcore=2.10.3'
-            manifestXml = self._download_webpage(manifest_url, video_id,
+            adoc = self._download_xml(manifest_url, video_id,
                                           u'Downloading XML manifest',
                                           u'Unable to download video info XML')
  
-            adoc = xml.etree.ElementTree.fromstring(manifestXml)
              try:
-                media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
-                node_id = media_node.attrib['url']
                  video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
-            except IndexError as err:
+            except IndexError:
                  raise ExtractorError(u'Invalid manifest file')
              url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
              info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py

index 69b2beecebac319ef92e8043ab75ad71fad46a25..725849d2e98c89c2548778e6c37f73ce395d1c4a 100644 (file)
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -2,6 +2,7 @@ import re
  import xml.etree.ElementTree
  
  from .common import InfoExtractor
+from .mtv import MTVIE, _media_xml_tag
  from ..utils import (
      compat_str,
      compat_urllib_parse,
@@ -11,7 +12,37 @@ from ..utils import (
  )
  
  
-class ComedyCentralIE(InfoExtractor):
+class ComedyCentralIE(MTVIE):
+    """Extractor for comedycentral.com video pages, built on top of MTVIE."""
+    _VALID_URL = r'http://www\.comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+    _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
+    _TEST = {
+        u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+        u'md5': u'4167875aae411f903b751a21f357f1ee',
+        u'info_dict': {
+            u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            u'ext': u'mp4',
+            u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
+            u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
+        },
+    }
+    # Overwrite MTVIE properties we don't want
+    _TESTS = []
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        return itemdoc.find(search_path).attrib['url']
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
+                                  webpage, u'mgid')
+        return self._get_videos_info(mgid)
+
+
+class ComedyCentralShowsIE(InfoExtractor):
      IE_DESC = u'The Daily Show / Colbert Report'
      # urls can be abbreviations like :thedailyshow or :colbert
      # urls for episodes like:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index d4af3b5ebf119ef6612093bd6a5c03d1cc6cc380..5656445a3360b8e908280e967bb97fe5b22beffc 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -4,19 +4,22 @@ import re
  import socket
  import sys
  import netrc
+import xml.etree.ElementTree
  
  from ..utils import (
      compat_http_client,
      compat_urllib_error,
-    compat_urllib_request,
      compat_str,
  
      clean_html,
      compiled_regex_type,
      ExtractorError,
+    RegexNotFoundError,
+    sanitize_filename,
      unescapeHTML,
  )
  
+
  class InfoExtractor(object):
      """Information Extractor class.
  
@@ -61,11 +64,22 @@ class InfoExtractor(object):
                      * ext       Will be calculated from url if missing
                      * format    A human-readable description of the format
                                  ("mp4 container with h264/opus").
-                                Calculated from width and height if missing.
+                                Calculated from the format_id, width, height.
+                                and format_note fields if missing.
                      * format_id A short description of the format
                                  ("mp4_h264_opus" or "19")
+                    * format_note Additional info about the format
+                                ("3D" or "DASH video")
                      * width     Width of the video, if known
                      * height    Height of the video, if known
+                    * abr       Average audio bitrate in KBit/s
+                    * acodec    Name of the audio codec in use
+                    * vbr       Average video bitrate in KBit/s
+                    * vcodec    Name of the video codec in use
+                    * filesize  The number of bytes, if known in advance
+    webpage_url:    The url to the video webpage, if given to youtube-dl it
+                    should allow to get the same result again. (It will be set
+                    by YoutubeDL if it's missing)
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
@@ -144,7 +158,7 @@ class InfoExtractor(object):
          elif note is not False:
              self.to_screen(u'%s: %s' % (video_id, note))
          try:
-            return compat_urllib_request.urlopen(url_or_request)
+            return self._downloader.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              if errnote is None:
                  errnote = u'Unable to download webpage'
@@ -178,6 +192,17 @@ class InfoExtractor(object):
              self.to_screen(u'Dumping request to ' + url)
              dump = base64.b64encode(webpage_bytes).decode('ascii')
              self._downloader.to_screen(dump)
+        if self._downloader.params.get('write_pages', False):
+            try:
+                url = url_or_request.get_full_url()
+            except AttributeError:
+                url = url_or_request
+            raw_filename = ('%s_%s.dump' % (video_id, url))
+            filename = sanitize_filename(raw_filename, restricted=True)
+            self.to_screen(u'Saving request to ' + filename)
+            with open(filename, 'wb') as outf:
+                outf.write(webpage_bytes)
+
          content = webpage_bytes.decode(encoding, 'replace')
          return (content, urlh)
  
@@ -185,6 +210,11 @@ class InfoExtractor(object):
          """ Returns the data of the page as a string """
          return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
  
+    def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to download XML'):
+        """Download the page and return it parsed as an xml.etree.ElementTree.Element"""
+        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+
      def to_screen(self, msg):
          """Print msg to screen, prefixing it with '[ie_name]'"""
          self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@@ -206,12 +236,14 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    def url_result(self, url, ie=None):
+    def url_result(self, url, ie=None, video_id=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
                        'url': url,
                        'ie_key': ie}
+        if video_id is not None:
+            video_info['id'] = video_id
          return video_info
      def playlist_result(self, entries, playlist_id=None, playlist_title=None):
          """Returns a playlist"""
@@ -228,7 +260,7 @@ class InfoExtractor(object):
          Perform a regex search on the given string, using a single or a list of
          patterns returning the first matching group.
          In case of failure return a default value or raise a WARNING or a
-        ExtractorError, depending on fatal, specifying the field name.
+        RegexNotFoundError, depending on fatal, specifying the field name.
          """
          if isinstance(pattern, (str, compat_str, compiled_regex_type)):
              mobj = re.search(pattern, string, flags)
@@ -248,7 +280,7 @@ class InfoExtractor(object):
          elif default is not None:
              return default
          elif fatal:
-            raise ExtractorError(u'Unable to extract %s' % _name)
+            raise RegexNotFoundError(u'Unable to extract %s' % _name)
          else:
              self._downloader.report_warning(u'unable to extract %s; '
                  u'please report this issue on http://yt-dl.org/bug' % _name)
@@ -296,13 +328,21 @@ class InfoExtractor(object):
  
      # Helper functions for extracting OpenGraph info
      @staticmethod
-    def _og_regex(prop):
-        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+    def _og_regexes(prop):
+        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+        template = r'<meta[^>]+?%s[^>]+?%s'
+        return [
+            template % (property_re, content_re),
+            template % (content_re, property_re),
+        ]
  
      def _og_search_property(self, prop, html, name=None, **kargs):
          if name is None:
              name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+        if escaped is None:
+            return None
          return unescapeHTML(escaped)
  
      def _og_search_thumbnail(self, html, **kargs):
@@ -314,10 +354,21 @@ class InfoExtractor(object):
      def _og_search_title(self, html, **kargs):
          return self._og_search_property('title', html, **kargs)
  
-    def _og_search_video_url(self, html, name='video url', **kargs):
-        return self._html_search_regex([self._og_regex('video:secure_url'),
-                                        self._og_regex('video')],
-                                       html, name, **kargs)
+    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
+        regexes = self._og_regexes('video')
+        if secure: regexes = self._og_regexes('video:secure_url') + regexes
+        return self._html_search_regex(regexes, html, name, **kargs)
+
+    def _html_search_meta(self, name, html, display_name=None):
+        if display_name is None:
+            display_name = name
+        return self._html_search_regex(
+            r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
+                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
+            html, display_name, fatal=False)
+
+    def _dc_search_uploader(self, html):
+        return self._html_search_meta('dc.creator', html, 'uploader')
  
      def _rta_search(self, html):
          # See http://www.rtalabel.org/index.php?content=howtofaq#single
@@ -327,6 +378,23 @@ class InfoExtractor(object):
              return 18
          return 0
  
+    def _media_rating_search(self, html):
+        """Map the page's "rating" meta tag to a number; None if missing or unknown."""
+        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+        rating = self._html_search_meta('rating', html)
+        if not rating:
+            return None
+
+        RATING_TABLE = {
+            'safe for kids': 0,
+            'general': 8,
+            '14 years': 14,
+            'mature': 17,
+            'restricted': 19,
+        }
+        return RATING_TABLE.get(rating.lower(), None)
+
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py

new file mode 100644 (file)

index 0000000..a56842b
--- /dev/null
+++ b/youtube_dl/extractor/d8.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+from .canalplus import CanalplusIE
+
+
+class D8IE(CanalplusIE):
+    """CanalplusIE subclass for d8.tv pages, using the d8 video-info service."""
+    _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
+    IE_NAME = u'd8.tv'
+    _TEST = {
+        u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+        u'file': u'966289.flv',
+        u'info_dict': {
+            u'title': u'Campagne intime - Documentaire exceptionnel',
+            u'description': u'md5:d2643b799fb190846ae09c61e59a859f',
+            u'upload_date': u'20131108',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index 7d83539469d3d7ff120f916cc837a60bacfe8390..71f5e03eea393b7733bf3bfeb4f2eeea5b21eb85 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
          """Build a request with the family filter disabled"""
          request = compat_urllib_request.Request(url)
          request.add_header('Cookie', 'family_filter=off')
+        request.add_header('Cookie', 'family_filter=off; ff=off')  # add_header replaces same-name headers
          return request
  
  class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
@@ -28,6 +29,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
  
      _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
      IE_NAME = u'dailymotion'
+
+    _FORMATS = [
+        (u'stream_h264_ld_url', u'ld'),
+        (u'stream_h264_url', u'standard'),
+        (u'stream_h264_hq_url', u'hq'),
+        (u'stream_h264_hd_url', u'hd'),
+        (u'stream_h264_hd1080_url', u'hd180'),
+    ]
+
      _TESTS = [
          {
              u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
@@ -52,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
              },
              u'skip': u'VEVO is only available in some countries',
          },
+        # age-restricted video
+        {
+            u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+            u'file': u'xyh2zz.mp4',
+            u'md5': u'0d667a7b9cebecc3c89ee93099c4159d',
+            u'info_dict': {
+                u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+                u'uploader': 'HotWaves1012',
+                u'age_limit': 18,
+            }
+
+        }
      ]
  
      def _real_extract(self, url):
@@ -60,7 +82,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
  
          video_id = mobj.group(1).split('_')[0].split('?')[0]
  
-        video_extension = 'mp4'
          url = 'http://www.dailymotion.com/video/%s' % video_id
  
          # Retrieve video webpage to extract further information
@@ -82,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                               # Looking for official user
                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
-                                            webpage, 'video uploader')
+                                            webpage, 'video uploader', fatal=False)
+        age_limit = self._rta_search(webpage)
  
          video_upload_date = None
          mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
@@ -99,37 +121,43 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
              msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
              raise ExtractorError(msg, expected=True)
  
-        # TODO: support choosing qualities
-
-        for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
-                    'stream_h264_hq_url','stream_h264_url',
-                    'stream_h264_ld_url']:
-            if info.get(key):#key in info and info[key]:
-                max_quality = key
-                self.to_screen(u'Using %s' % key)
-                break
-        else:
+        formats = []
+        for (key, format_id) in self._FORMATS:
+            video_url = info.get(key)
+            if video_url is not None:
+                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
+                if m_size is not None:
+                    width, height = m_size.group(1), m_size.group(2)
+                else:
+                    width, height = None, None
+                formats.append({
+                    'url': video_url,
+                    'ext': 'mp4',
+                    'format_id': format_id,
+                    'width': width,
+                    'height': height,
+                })
+        if not formats:
              raise ExtractorError(u'Unable to extract video URL')
-        video_url = info[max_quality]
  
          # subtitles
-        video_subtitles = self.extract_subtitles(video_id)
+        video_subtitles = self.extract_subtitles(video_id, webpage)
          if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id)
+            self._list_available_subtitles(video_id, webpage)
              return
  
-        return [{
+        return {
              'id':       video_id,
-            'url':      video_url,
+            'formats': formats,
              'uploader': video_uploader,
              'upload_date':  video_upload_date,
              'title':    self._og_search_title(webpage),
-            'ext':      video_extension,
              'subtitles':    video_subtitles,
-            'thumbnail': info['thumbnail_url']
-        }]
+            'thumbnail': info['thumbnail_url'],
+            'age_limit': age_limit,
+        }
  
-    def _get_available_subtitles(self, video_id):
+    def _get_available_subtitles(self, video_id, webpage):
          try:
              sub_list = self._download_webpage(
                  'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
@@ -158,7 +186,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
              webpage = self._download_webpage(request,
                                               id, u'Downloading page %s' % pagenum)
  
-            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+            playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
              video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
  
              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py

index d43348955f122edf17573660077dca407634e329..2c9fb5f2e08dd69e977ab5dc599d97a5ff26a917 100644 (file)
--- a/youtube_dl/extractor/depositfiles.py
+++ b/youtube_dl/extractor/depositfiles.py
@@ -25,7 +25,7 @@ class DepositFilesIE(InfoExtractor):
          url = 'http://depositfiles.com/en/files/' + file_id
  
          # Retrieve file webpage with 'Free download' button pressed
-        free_download_indication = { 'gateway_result' : '1' }
+        free_download_indication = {'gateway_result' : '1'}
          request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
          try:
              self.report_download_webpage(file_id)
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py

index cced0681171a3dbc818e62ee2551da1958eacae2..f21ef88530d2f8913b4b35d9c03fc4fc14de7ddc 100644 (file)
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -1,4 +1,3 @@
-import itertools
  import json
  import random
  import re
@@ -101,7 +100,7 @@ class EightTracksIE(InfoExtractor):
          first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
          next_url = first_url
          res = []
-        for i in itertools.count():
+        for i in range(track_count):
              api_json = self._download_webpage(next_url, playlist_id,
                  note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                  errnote=u'Failed to download song information')
@@ -116,7 +115,5 @@ class EightTracksIE(InfoExtractor):
                  'ext': 'm4a',
              }
              res.append(info)
-            if api_data['set']['at_last_track']:
-                break
              next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
          return res
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py

new file mode 100644 (file)

index 0000000..4ba3231
--- /dev/null
+++ b/youtube_dl/extractor/eitb.py
@@ -0,0 +1,37 @@
+# encoding: utf-8
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from ..utils import ExtractorError
+
+
+class EitbIE(InfoExtractor):
+    IE_NAME = u'eitb.tv'
+    _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'
+
+    _TEST = {
+        u'add_ie': ['Brightcove'],
+        u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
+        u'md5': u'edf4436247185adee3ea18ce64c47998',
+        u'info_dict': {
+            u'id': u'2743577154001',
+            u'ext': u'mp4',
+            u'title': u'60 minutos (Lasa y Zabala, 30 años)',
+            # All videos from eitb have this description in the brightcove info
+            u'description': u'.',
+            u'uploader': u'Euskal Telebista',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        chapter_id = mobj.group('chapter_id')
+        webpage = self._download_webpage(url, chapter_id)
+        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+        if bc_url is None:
+            raise ExtractorError(u'Could not extract the Brightcove url')
+        # The BrightcoveExperience object doesn't contain the video id, so we
+        # append it manually as the @videoPlayer parameter
+        bc_url += '&%40videoPlayer={0}'.format(chapter_id)
+        return self.url_result(bc_url, BrightcoveIE.ie_key())
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py

index 3aa2da52c0117bc9926df9c250eeb70da6cc2299..b1242f6bc457a41a9c8413eb851671acd05cc8c0 100644 (file)
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -11,11 +11,11 @@ from ..utils import (
  
  
  class EscapistIE(InfoExtractor):
-    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
+    _VALID_URL = r'^https?://(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
      _TEST = {
          u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
          u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
-        u'md5': u'c6793dbda81388f4264c1ba18684a74d',
+        u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
          u'info_dict': {
              u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", 
              u"uploader": u"the-escapist-presents", 
@@ -25,50 +25,60 @@ class EscapistIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          showName = mobj.group('showname')
          videoId = mobj.group('episode')
  
          self.report_extraction(videoId)
          webpage = self._download_webpage(url, videoId)
  
-        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+        videoDesc = self._html_search_regex(
+            r'<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
  
-        playerUrl = self._og_search_video_url(webpage, name='player url')
+        playerUrl = self._og_search_video_url(webpage, name=u'player URL')
  
-        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
-            webpage, u'player url').split(' : ')[-1]
+        title = self._html_search_regex(
+            r'<meta name="title" content="([^"]*)"',
+            webpage, u'title').split(' : ')[-1]
  
-        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
          configUrl = compat_urllib_parse.unquote(configUrl)
  
-        configJSON = self._download_webpage(configUrl, videoId,
-                                            u'Downloading configuration',
-                                            u'unable to download configuration')
-
-        # Technically, it's JavaScript, not JSON
-        configJSON = configJSON.replace("'", '"')
-
+        formats = []
+
+        def _add_format(name, cfgurl):
+            configJSON = self._download_webpage(
+                cfgurl, videoId,
+                u'Downloading ' + name + ' configuration',
+                u'Unable to download ' + name + ' configuration')
+
+            # Technically, it's JavaScript, not JSON: swap the quotes so json.loads accepts it
+            configJSON = configJSON.replace("'", '"')
+
+            try:
+                config = json.loads(configJSON)
+            except (ValueError,) as err:
+                raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+            playlist = config['playlist']
+            formats.append({
+                'url': playlist[1]['url'],
+                'format_id': name,
+            })
+
+        _add_format(u'normal', configUrl)
+        hq_url = (configUrl +
+                  ('&hq=1' if '?' in configUrl else '?hq=1'))
          try:
-            config = json.loads(configJSON)
-        except (ValueError,) as err:
-            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+            _add_format(u'hq', hq_url)
+        except ExtractorError:
+            pass  # That's fine, we'll just use normal quality
  
-        playlist = config['playlist']
-        videoUrl = playlist[1]['url']
-
-        info = {
+        return {
              'id': videoId,
-            'url': videoUrl,
+            'formats': formats,
              'uploader': showName,
-            'upload_date': None,
              'title': title,
-            'ext': 'mp4',
              'thumbnail': self._og_search_thumbnail(webpage),
              'description': videoDesc,
              'player_url': playerUrl,
          }
-
-        return [info]
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py

index 3443f19c5f9bb8e2853c95b4ca5e153b395a701f..a51d79b08c656144c3f67d853fcae8fe52bc6e1f 100644 (file)
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -11,16 +11,17 @@ class ExfmIE(InfoExtractor):
      _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud.com/tracks/([^/]+)/stream'
      _TESTS = [
          {
-            u'url': u'http://ex.fm/song/1bgtzg',
-            u'file': u'95223130.mp3',
-            u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf',
+            u'url': u'http://ex.fm/song/eh359',
+            u'file': u'44216187.mp3',
+            u'md5': u'e45513df5631e6d760970b14cc0c11e7',
              u'info_dict': {
-                u"title": u"We Can't Stop - Miley Cyrus",
-                u"uploader": u"Miley Cyrus",
-                u'upload_date': u'20130603',
-                u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC',
+                u"title": u"Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive",
+                u"uploader": u"deadjournalist",
+                u'upload_date': u'20120424',
+                u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
              },
              u'note': u'Soundcloud song',
+            u'skip': u'The site is down too often',
          },
          {
              u'url': u'http://ex.fm/song/wddt8',
@@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):
                  u'title': u'Safe and Sound',
                  u'uploader': u'Capital Cities',
              },
+            u'skip': u'The site is down too often',
          },
      ]
  
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py

new file mode 100644 (file)

index 0000000..1c20e43
--- /dev/null
+++ b/youtube_dl/extractor/extremetube.py
@@ -0,0 +1,50 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+
+class ExtremeTubeIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
+        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+        u'file': u'652431.mp4',
+        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
+        u'info_dict': {
+            u"title": u"Music Video 14 british euro brit european cumshots swallow",
+            u"uploader": u"unknown",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+        # Request the page with the age_verified cookie set
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
+        uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False)
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        format_parts = path.split('/')[5].split('_')[:2]
+        format_name = "-".join(format_parts)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'uploader': uploader,
+            'url': video_url,
+            'ext': extension,
+            'format': format_name,
+            'format_id': format_name,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py

index 9d1bc07510c3148b8ed8659d697c46017c6a36ff..3b210710e3695ec3aa940b335d9868a281d7740a 100644 (file)
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,5 +1,4 @@
  import json
-import netrc
  import re
  import socket
  
@@ -19,7 +18,8 @@ class FacebookIE(InfoExtractor):
      """Information Extractor for Facebook"""
  
      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
-    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
+    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
+    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
      _NETRC_MACHINE = 'facebook'
      IE_NAME = u'facebook'
      _TEST = {
@@ -36,50 +36,56 @@ class FacebookIE(InfoExtractor):
          """Report attempt to log in."""
          self.to_screen(u'Logging in')
  
-    def _real_initialize(self):
-        if self._downloader is None:
-            return
-
-        useremail = None
-        password = None
-        downloader_params = self._downloader.params
-
-        # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username', None) is not None:
-            useremail = downloader_params['username']
-            password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    useremail = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
-                return
-
+    def _login(self):
+        (useremail, password) = self._get_login_info()
          if useremail is None:
              return
  
-        # Log in
+        login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
+        login_page_req.add_header('Cookie', 'locale=en_US')
+        self.report_login()
+        login_page = self._download_webpage(login_page_req, None, note=False,
+            errnote=u'Unable to download login page')
+        lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
+        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+
          login_form = {
              'email': useremail,
              'pass': password,
-            'login': 'Log+In'
+            'lsd': lsd,
+            'lgnrnd': lgnrnd,
+            'next': 'http://facebook.com/home.php',
+            'default_persistent': '0',
+            'legacy_return': '1',
+            'timezone': '-60',
+            'trynum': '1',
              }
          request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
          try:
-            self.report_login()
              login_results = compat_urllib_request.urlopen(request).read()
              if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                  self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                  return
+
+            check_form = {
+                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
+                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+                'name_action_selected': 'dont_save',
+                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
+            }
+            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
+            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            check_response = compat_urllib_request.urlopen(check_req).read()
+            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
+                self._downloader.report_warning(u'Unable to confirm login, you have to log in with your browser and authorize the login.')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
              return
  
+    def _real_initialize(self):
+        self._login()
+
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          if mobj is None:
@@ -93,7 +99,13 @@ class FacebookIE(InfoExtractor):
          AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
          m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
          if not m:
-            raise ExtractorError(u'Cannot parse data')
+            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
+            if m_msg is not None:
+                raise ExtractorError(
+                    u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+                    expected=True)
+            else:
+                raise ExtractorError(u'Cannot parse data')
          data = dict(json.loads(m.group(1)))
          params_raw = compat_urllib_parse.unquote(data['params'])
          params = json.loads(params_raw)
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py

index deaa4ed2d9bc14406b6a7d3d6e8b015c6fcf915d..89ed08db4cbb99f9381013813fa03a19474c8e24 100644 (file)
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -5,8 +5,6 @@ import xml.etree.ElementTree
  from .common import InfoExtractor
  from ..utils import (
      determine_ext,
-    clean_html,
-    get_element_by_attribute,
  )
  
  
@@ -47,12 +45,12 @@ class FazIE(InfoExtractor):
                  'format_id': code.lower(),
              })
  
-        descr_html = get_element_by_attribute('class', 'Content Copy', webpage)
+        descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
          info = {
              'id': video_id,
              'title': self._og_search_title(webpage),
              'formats': formats,
-            'description': clean_html(descr_html),
+            'description': descr,
              'thumbnail': config.find('STILL/STILL_BIG').text,
          }
          # TODO: Remove when #980 has been merged
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py

index 9c89362efafefbb22c4dd5e4ef73950446fe9246..dba1a8dc262979b5afce987211bab2f14e502dba 100644 (file)
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -39,7 +39,6 @@ class FKTVIE(InfoExtractor):
          for i, _ in enumerate(files, 1):
              video_id = '%04d%d' % (episode, i)
              video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
-            video_title = 'Fernsehkritik %d.%d' % (episode, i)
              videos.append({
                  'id': video_id,
                  'url': video_url,
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py

new file mode 100644 (file)

index 0000000..c91669b
--- /dev/null
+++ b/youtube_dl/extractor/gamekings.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+
+
+class GamekingsIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
+    _TEST = {
+        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
+        u'file': u'20130811.mp4',
+        # MD5 is flaky, seems to change regularly
+        #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+        u'info_dict': {
+            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
+            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for a gamekings.tv video page."""
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        webpage = self._download_webpage(url, name)
+        video_url = self._og_search_video_url(webpage)
+
+        video = re.search(r'[0-9]+', video_url)
+        video_id = video.group(0)
+
+        # TODO: add medium format
+        video_url = video_url.replace(video_id, 'large/' + video_id)
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py

index 098768361ede01d8acc01dc773a31b5b8fc67241..9645b00c3307a42ba48b66af599345ba80349a3d 100644 (file)
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -24,7 +24,7 @@ class GameSpotIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        page_id = video_id = mobj.group('page_id')
+        page_id = mobj.group('page_id')
          webpage = self._download_webpage(url, page_id)
          data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
          data_video = json.loads(unescapeHTML(data_video_json))
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 69e0a7bd271dd0965f5c1f6f9c3a7cdce7a3da0b..37671430a99b66dea8339dfd986503f3cb57f59e 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -25,7 +25,7 @@ class GenericIE(InfoExtractor):
          {
              u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
              u'file': u'13601338388002.mp4',
-            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd',
              u'info_dict': {
                  u"uploader": u"www.hodiho.fr",
                  u"title": u"R\u00e9gis plante sa Jeep"
@@ -33,6 +33,7 @@ class GenericIE(InfoExtractor):
          },
          # embedded vimeo video
          {
+            u'add_ie': ['Vimeo'],
              u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
              u'file': u'22444065.mp4',
              u'md5': u'2903896e23df39722c33f015af0666e2',
@@ -41,7 +42,35 @@ class GenericIE(InfoExtractor):
                  u"uploader_id": u"skillsmatter",
                  u"uploader": u"Skills Matter",
              }
-        }
+        },
+        # bandcamp page with custom domain
+        {
+            u'add_ie': ['Bandcamp'],
+            u'url': u'http://bronyrock.com/track/the-pony-mash',
+            u'file': u'3235767654.mp3',
+            u'info_dict': {
+                u'title': u'The Pony Mash',
+                u'uploader': u'M_Pallante',
+            },
+            u'skip': u'There is a limit of 200 free downloads / month for the test song',
+        },
+        # embedded brightcove video
+        # it also tests brightcove videos that need to set the 'Referer' in the
+        # http requests
+        {
+            u'add_ie': ['Brightcove'],
+            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
+            u'info_dict': {
+                u'id': u'2765128793001',
+                u'ext': u'mp4',
+                u'title': u'Le cours de bourse : l’analyse technique',
+                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9',
+                u'uploader': u'BFM BUSINESS',
+            },
+            u'params': {
+                u'skip_download': True,
+            },
+        },
      ]
  
      def report_download_webpage(self, video_id):
@@ -133,11 +162,20 @@ class GenericIE(InfoExtractor):
              raise ExtractorError(u'Failed to download URL: %s' % url)
  
          self.report_extraction(video_id)
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title', default=u'video', flags=re.DOTALL)
+
          # Look for BrightCove:
-        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
-        if m_brightcove is not None:
+        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+        if bc_url is not None:
              self.to_screen(u'Brightcove video detected.')
-            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
              return self.url_result(bc_url, 'Brightcove')
  
          # Look for embedded Vimeo player
@@ -149,11 +187,20 @@ class GenericIE(InfoExtractor):
              return self.url_result(surl, 'Vimeo')
  
          # Look for embedded YouTube player
-        mobj = re.search(
-            r'<iframe[^>]+?src="(https?://(?:www\.)?youtube.com/embed/.+?)"', webpage)
-        if mobj:
-            surl = unescapeHTML(mobj.group(1))
-            return self.url_result(surl, 'Youtube')
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/embed/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for Bandcamp pages with custom domain
+        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
+        if mobj is not None:
+            burl = unescapeHTML(mobj.group(1))
+            # Don't set the extractor because it can be a track url or an album
+            return self.url_result(burl)
  
          # Start with something easy: JW Player in SWFObject
          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
@@ -162,7 +209,7 @@ class GenericIE(InfoExtractor):
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
          if mobj is None:
              # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
          if mobj is None:
              # Try to find twitter cards info
              mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -189,27 +236,16 @@ class GenericIE(InfoExtractor):
          video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
  
          # here's a fun little line of code for you:
-        video_extension = os.path.splitext(video_id)[1][1:]
          video_id = os.path.splitext(video_id)[0]
  
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
-
          # video uploader is domain name
          video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
              url, u'video uploader')
  
-        return [{
+        return {
              'id':       video_id,
              'url':      video_url,
              'uploader': video_uploader,
              'upload_date':  None,
              'title':    video_title,
-            'ext':      video_extension,
-        }]
+        }
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py

index ab12d7e9381317b4dfddb679eced39db2f752ed4..2570746b2047a1d1ae0a60b48970b1414f168e40 100644 (file)
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -41,9 +41,9 @@ class GooglePlusIE(InfoExtractor):
  
          # Extract update date
          upload_date = self._html_search_regex(
-            r'''(?x)<a.+?class="o-T-s\s[^"]+"\s+style="display:\s*none"\s*>
+            r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
                      ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
-            webpage, u'upload date', fatal=False)
+            webpage, u'upload date', fatal=False, flags=re.VERBOSE)
          if upload_date:
              # Convert timestring to a format suitable for filename
              upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py

index 46954337f25e1cbd7bae89e7da76d4e93ecc8c9e..bafc5826f680353af40b820609a543192ac73d17 100644 (file)
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
      _TEST = {
          u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
          u'file': u'390161.mp4',
-        u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138',
+        u'md5': u'8b743df908c42f60cf6496586c7f12c3',
          u'info_dict': {
              u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.", 
              u"title": u"How to Tie a Square Knot Properly"
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py

index ab2b591036a6371fc31e7437c368858dfb708efa..9bd06e7c7913e9c7492f63417760012f1219c875 100644 (file)
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -30,7 +30,7 @@ class HypemIE(InfoExtractor):
              raise ExtractorError(u'Invalid URL: %s' % url)
          track_id = mobj.group(1)
  
-        data = { 'ax': 1, 'ts': time.time() }
+        data = {'ax': 1, 'ts': time.time()}
          data_encoded = compat_urllib_parse.urlencode(data)
          complete_url = url + "?" + data_encoded
          request = compat_urllib_request.Request(complete_url)
@@ -68,4 +68,4 @@ class HypemIE(InfoExtractor):
              'ext':      "mp3",
              'title':    title,
              'artist':   artist,
-        }]
-\ No newline at end of file
+        }]
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py

index ddc42882a436a216cbd24b0b28d03da89ec27b0d..213aac428451bfcb860585b26de0e1c43abc732d 100644 (file)
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -26,7 +26,7 @@ class InstagramIE(InfoExtractor):
  
          return [{
              'id':        video_id,
-            'url':       self._og_search_video_url(webpage),
+            'url':       self._og_search_video_url(webpage, secure=False),
              'ext':       'mp4',
              'title':     u'Video by %s' % uploader_id,
              'thumbnail': self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py

index 5986459d6dfdf7358b7d7a2e4bc139a579a01265..be8e05f539d7f64c301f7a63a488aedbf9d129cd 100644 (file)
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -19,7 +19,7 @@ class InternetVideoArchiveIE(InfoExtractor):
          u'info_dict': {
              u'title': u'SKYFALL',
              u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
-            u'duration': 156,
+            u'duration': 153,
          },
      }
  
@@ -74,7 +74,7 @@ class InternetVideoArchiveIE(InfoExtractor):
              })
          formats = sorted(formats, key=lambda f: f['bitrate'])
  
-        info = {
+        return {
              'id': video_id,
              'title': item.find('title').text,
              'formats': formats,
@@ -82,6 +82,3 @@ class InternetVideoArchiveIE(InfoExtractor):
              'description': item.find('description').text,
              'duration': int(attr['duration']),
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py

index 6bb54b932298395b8f07554b12ad6091cca140d3..0020c47cfd53a05eb4c719a2d8c203783e97d4e2 100644 (file)
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -22,7 +22,7 @@ class JeuxVideoIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        title = re.match(self._VALID_URL, url).group(1)
+        title = mobj.group(1)
          webpage = self._download_webpage(url, title)
          xml_link = self._html_search_regex(
              r'<param name="flashvars" value="config=(.*?)" />',
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py

index 445d465017f513b55839ed9323c95212e6de7fb7..50916f4a66c6227e1eb4dc531745c9d5a6ad85d5 100644 (file)
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -1,8 +1,10 @@
  import re
+import hashlib
  
  from .common import InfoExtractor
  from ..utils import determine_ext
  
+_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
  
  class KankanIE(InfoExtractor):
      _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
@@ -30,7 +32,10 @@ class KankanIE(InfoExtractor):
                                                   video_id, u'Downloading video url info')
          ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')
          path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path')
-        video_url = 'http://%s%s' % (ip, path)
+        param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1')
+        param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2')
+        key = _md5('xl_mp43651' + param1 + param2)
+        video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)
  
          return {'id': video_id,
                  'title': title,
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py

new file mode 100644 (file)

index 0000000..29658a7
--- /dev/null
+++ b/youtube_dl/extractor/keezmovies.py
@@ -0,0 +1,61 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..aes import (
+    aes_decrypt_text
+)
+
+class KeezMoviesIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
+        u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
+        u'file': u'1214711.mp4',
+        u'md5': u'6e297b7e789329923fcf83abb67c9289',
+        u'info_dict': {
+            u"title": u"Petite Asian Lady Mai Playing In Bathtub",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        # embedded video
+        mobj = re.search(r'href="([^"]+)"></iframe>', webpage)
+        if mobj:
+            embedded_url = mobj.group(1)
+            return self.url_result(embedded_url)
+
+        video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title')
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+        if webpage.find('encrypted=true')!=-1:
+            password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
+            video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        format = path.split('/')[4].split('_')[:2]
+        format = "-".join(format)
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'url': video_url,
+            'ext': extension,
+            'format': format,
+            'format_id': format,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py

index d04da98c89ed582e83e8bb905b15ff04c78d3018..5f548437cd74d879e06ac4be64adf3a6de93ddef 100644 (file)
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -1,16 +1,17 @@
  import re
  import json
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urlparse,
-    get_meta_content,
-    ExtractorError,
+    xpath_with_ns,
  )
  
  
  class LivestreamIE(InfoExtractor):
+    IE_NAME = u'livestream'
      _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
      _TEST = {
          u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
@@ -40,13 +41,9 @@ class LivestreamIE(InfoExtractor):
  
          if video_id is None:
              # This is an event page:
-            player = get_meta_content('twitter:player', webpage)
-            if player is None:
-                raise ExtractorError('Couldn\'t extract event api url')
-            api_url = player.replace('/player', '')
-            api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url)
-            info = json.loads(self._download_webpage(api_url, event_name,
-                                                     u'Downloading event info'))
+            config_json = self._search_regex(r'window.config = ({.*?});',
+                webpage, u'window config')
+            info = json.loads(config_json)['event']
              videos = [self._extract_video_info(video_data['data'])
                  for video_data in info['feed']['data'] if video_data['type'] == u'video']
              return self.playlist_result(videos, info['id'], info['full_name'])
@@ -58,3 +55,44 @@ class LivestreamIE(InfoExtractor):
              info = json.loads(self._download_webpage(api_url, video_id,
                                                       u'Downloading video info'))
              return self._extract_video_info(info)
+
+
+# The original version of Livestream uses a different system
+class LivestreamOriginalIE(InfoExtractor):
+    IE_NAME = u'livestream:original'
+    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)'
+    _TEST = {
+        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        u'info_dict': {
+            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+            u'ext': u'flv',
+            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        user = mobj.group('user')
+        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
+
+        api_response = self._download_webpage(api_url, video_id)
+        info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8'))
+        item = info.find('channel').find('item')
+        ns = {'media': 'http://search.yahoo.com/mrss'}
+        thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']
+        # Remove the extension and number from the path (like 1.jpg)
+        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path')
+
+        return {
+            'id': video_id,
+            'title': item.find('title').text,
+            'url': 'rtmp://extondemand.livestream.com/ondemand',
+            'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path),
+            'ext': 'flv',
+            'thumbnail': thumbnail_url,
+        }
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py

index e537648ffb83564e56f43f7e1e21a949cc609925..91480ba875d5fff781ce08a47c41a3824e94e910 100644 (file)
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -20,10 +20,12 @@ class MetacafeIE(InfoExtractor):
      _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
      _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
      IE_NAME = u'metacafe'
-    _TESTS = [{
+    _TESTS = [
+    # Youtube video
+    {
          u"add_ie": ["Youtube"],
          u"url":  u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
-        u"file":  u"_aUehQsCQtM.flv",
+        u"file":  u"_aUehQsCQtM.mp4",
          u"info_dict": {
              u"upload_date": u"20090102",
              u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!",
@@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor):
              u"uploader_id": u"PBS"
          }
      },
+    # Normal metacafe video
+    {
+        u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+        u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
+        u'info_dict': {
+            u'id': u'11121940',
+            u'ext': u'mp4',
+            u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
+            u'uploader': u'ign',
+            u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+        },
+    },
+    # AnyClip video
      {
          u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
          u"file": u"an-dVVXnuY7Jh77J.mp4",
          u"info_dict": {
              u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
              u"uploader": u"anyclip",
-            u"description": u"md5:38c711dd98f5bb87acf973d573442e67"
-        }
-    }]
+            u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
+        },
+    },
+    # age-restricted video
+    {
+        u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+        u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
+        u'info_dict': {
+            u'id': u'5186653',
+            u'ext': u'mp4',
+            u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+            u'uploader': u'Dwayne Pipe',
+            u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
+            u'age_limit': 18,
+        },
+    },
+    ]
  
  
      def report_disclaimer(self):
@@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor):
              'submit': "Continue - I'm over 18",
              }
          request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
          try:
              self.report_age_confirmation()
              compat_urllib_request.urlopen(request).read()
@@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor):
  
          # Retrieve video webpage to extract further information
          req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
-        req.headers['Cookie'] = 'flashVersion=0;'
+
+        # AnyClip videos require the flashversion cookie so that we get the link
+        # to the mp4 file
+        mobj_an = re.match(r'^an-(.*?)$', video_id)
+        if mobj_an:
+            req.headers['Cookie'] = 'flashVersion=0;'
          webpage = self._download_webpage(req, video_id)
  
          # Extract URL, uploader and title from webpage
@@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor):
                  r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
                  webpage, u'uploader nickname', fatal=False)
  
+        if re.search(r'"contentRating":"restricted"', webpage) is not None:
+            age_limit = 18
+        else:
+            age_limit = 0
+
          return {
              '_type':    'video',
              'id':       video_id,
@@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor):
              'upload_date':  None,
              'title':    video_title,
              'ext':      video_ext,
+            'age_limit': age_limit,
          }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py

index a200dcd74a5a7af220cedea02a60c01cfd643e79..e2baf44d7e15032022e6b304ace2bf8ef11a09b2 100644 (file)
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor):
              'title': info['name'],
              'url': final_song_url,
              'ext': 'mp3',
-            'description': info['description'],
+            'description': info.get('description'),
              'thumbnail': info['pictures'].get('extra_large'),
              'uploader': info['user']['name'],
              'uploader_id': info['user']['username'],
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py

new file mode 100644 (file)

index 0000000..b9430b0
--- /dev/null
+++ b/youtube_dl/extractor/mofosex.py
@@ -0,0 +1,49 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+
+class MofosexIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
+    _TEST = {
+        u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
+        u'file': u'5018.mp4',
+        u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a',
+        u'info_dict': {
+            u"title": u"Japanese Teen Music Video",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title')
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url'))
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        format = path.split('/')[5].split('_')[:2]
+        format = "-".join(format)
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'url': video_url,
+            'ext': extension,
+            'format': format,
+            'format_id': format,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index e520e2bb491f2c55f3867ab214b2b949eca6e684..04afd6c4c86233512d9e42f0493f5d7a0a7b00b0 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -26,6 +26,7 @@ class MTVIE(InfoExtractor):
              },
          },
          {
+            u'add_ie': ['Vevo'],
              u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
              u'file': u'USCJY1331283.mp4',
              u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
@@ -47,7 +48,7 @@ class MTVIE(InfoExtractor):
      def _transform_rtmp_url(rtmp_video_url):
          m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
          if not m:
-            raise ExtractorError(u'Cannot transform RTMP url')
+            return rtmp_video_url
          base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
          return base + m.group('finalid')
  
@@ -58,7 +59,6 @@ class MTVIE(InfoExtractor):
          if '/error_country_block.swf' in metadataXml:
              raise ExtractorError(u'This video is not available from your country.', expected=True)
          mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
-        renditions = mdoc.findall('.//rendition')
  
          formats = []
          for rendition in mdoc.findall('.//rendition'):
@@ -80,6 +80,8 @@ class MTVIE(InfoExtractor):
          video_id = self._id_from_uri(uri)
          self.report_extraction(video_id)
          mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+        # Remove the templates, like &device={device}
+        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)
          if 'acceptMethods' not in mediagen_url:
              mediagen_url += '&acceptMethods=fms'
          mediagen_page = self._download_webpage(mediagen_url, video_id,
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py

new file mode 100644 (file)

index 0000000..050f54a
--- /dev/null
+++ b/youtube_dl/extractor/myspace.py
@@ -0,0 +1,48 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+)
+
+
+class MySpaceIE(InfoExtractor):
+    _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689',
+        u'info_dict': {
+            u'id': u'100008689',
+            u'ext': u'flv',
+            u'title': u'Viva La Vida',
+            u'description': u'The official Viva La Vida video, directed by Hype Williams',
+            u'uploader': u'Coldplay',
+            u'uploader_id': u'coldplay',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        context = json.loads(self._search_regex(r'context = ({.*?});', webpage,
+            u'context'))
+        video = context['video']
+        rtmp_url, play_path = video['streamUrl'].split(';', 1)
+
+        return {
+            'id': compat_str(video['mediaId']),
+            'title': video['title'],
+            'url': rtmp_url,
+            'play_path': play_path,
+            'ext': 'flv',
+            'description': video['description'],
+            'thumbnail': video['imageUrl'],
+            'uploader': video['artistName'],
+            'uploader_id': video['artistUsername'],
+        }
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py

index e8d43dd135ff29bc9471466f66b8b4e1b55eadea..458fe40639171943fec8b516a5facba5bcfb3df6 100644 (file)
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -72,7 +72,7 @@ class NHLIE(NHLBaseInfoExtractor):
  
  class NHLVideocenterIE(NHLBaseInfoExtractor):
      IE_NAME = u'nhl.com:videocenter'
-    IE_DESC = u'Download the first 12 videos from a videocenter category'
+    IE_DESC = u'NHL videocenter category'
      _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
  
      @classmethod
@@ -90,8 +90,8 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
               r'{statusIndex:0,index:0,.*?id:(.*?),'],
              webpage, u'category id')
          playlist_title = self._html_search_regex(
-            r'\?catid=%s">(.*?)</a>' % cat_id,
-            webpage, u'playlist title', flags=re.DOTALL)
+            r'tab0"[^>]*?>(.*?)</td>',
+            webpage, u'playlist title', flags=re.DOTALL).lower().capitalize()
  
          data = compat_urllib_parse.urlencode({
              'cid': cat_id,
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py

new file mode 100644 (file)

index 0000000..729607e
--- /dev/null
+++ b/youtube_dl/extractor/niconico.py
@@ -0,0 +1,131 @@
+# encoding: utf-8
+
+import re
+import socket
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_http_client,
+    compat_urllib_error,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+    compat_str,
+
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class NiconicoIE(InfoExtractor):
+    IE_NAME = u'niconico'
+    IE_DESC = u'ニコニコ動画'
+
+    _TEST = {
+        u'url': u'http://www.nicovideo.jp/watch/sm22312215',
+        u'file': u'sm22312215.mp4',
+        u'md5': u'd1a75c0823e2f629128c43e1212760f9',
+        u'info_dict': {
+            u'title': u'Big Buck Bunny',
+            u'uploader': u'takuya0301',
+            u'uploader_id': u'2698420',
+            u'upload_date': u'20131123',
+            u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+        },
+        u'params': {
+            u'username': u'ydl.niconico@gmail.com',
+            u'password': u'youtube-dl',
+        },
+    }
+
+    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
+    _NETRC_MACHINE = 'niconico'
+    # If True it will raise an error if no login info is provided
+    _LOGIN_REQUIRED = True
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        # No authentication to be performed
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return False
+
+        # Log in
+        login_form_strs = {
+            u'mail': username,
+            u'password': password,
+        }
+        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+        # chokes on unicode
+        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+        request = compat_urllib_request.Request(
+            u'https://secure.nicovideo.jp/secure/login', login_data)
+        login_results = self._download_webpage(
+            request, u'', note=u'Logging in', errnote=u'Unable to log in')
+        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+            self._downloader.report_warning(u'unable to log in: bad username or password')
+            return False
+        return True
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        # Get video webpage. We are not actually interested in it, but need
+        # the cookies in order to be able to download the info webpage
+        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+
+        video_info_webpage = self._download_webpage(
+            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
+            note=u'Downloading video info page')
+
+        # Get flv info
+        flv_info_webpage = self._download_webpage(
+            u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+            video_id, u'Downloading flv info')
+        video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+
+        # Start extracting information
+        video_info = xml.etree.ElementTree.fromstring(video_info_webpage)
+        video_title = video_info.find('.//title').text
+        video_extension = video_info.find('.//movie_type').text
+        video_format = video_extension.upper()
+        video_thumbnail = video_info.find('.//thumbnail_url').text
+        video_description = video_info.find('.//description').text
+        video_uploader_id = video_info.find('.//user_id').text
+        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
+        video_view_count = video_info.find('.//view_counter').text
+        video_webpage_url = video_info.find('.//watch_url').text
+
+        # uploader
+        video_uploader = video_uploader_id
+        url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
+        try:
+            user_info_webpage = self._download_webpage(
+                url, video_id, note=u'Downloading user information')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+        else:
+            user_info = xml.etree.ElementTree.fromstring(user_info_webpage)
+            video_uploader = user_info.find('.//nickname').text
+
+        return {
+            'id':          video_id,
+            'url':         video_real_url,
+            'title':       video_title,
+            'ext':         video_extension,
+            'format':      video_format,
+            'thumbnail':   video_thumbnail,
+            'description': video_description,
+            'uploader':    video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            'view_count':  video_view_count,
+            'webpage_url': video_webpage_url,
+        }
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py

index ab52ad4011851405e9a6b17f73720a8cd646860c..241cc160b9ca58bfc6b88bf9c12fe134df3b3d66 100644 (file)
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -20,7 +20,10 @@ class NowVideoIE(InfoExtractor):
  
          video_id = mobj.group('id')
          webpage_url = 'http://www.nowvideo.ch/video/' + video_id
+        embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
          webpage = self._download_webpage(webpage_url, video_id)
+        embed_page = self._download_webpage(embed_url, video_id,
+            u'Downloading embed page')
  
          self.report_extraction(video_id)
  
@@ -28,7 +31,7 @@ class NowVideoIE(InfoExtractor):
              webpage, u'video title')
  
          video_key = self._search_regex(r'var fkzd="(.*)";',
-            webpage, u'video key')
+            embed_page, u'video key')
  
          api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
          api_response = self._download_webpage(api_call, video_id,
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py

new file mode 100644 (file)

index 0000000..8b34719
--- /dev/null
+++ b/youtube_dl/extractor/pornhub.py
@@ -0,0 +1,68 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..aes import (
+    aes_decrypt_text
+)
+
+class PornHubIE(InfoExtractor):
+    """Information extractor for pornhub.com ``view_video.php`` pages.
+
+    The page is requested with the ``age_verified`` cookie set; title,
+    uploader and thumbnail are scraped from the HTML, and one format entry
+    is produced for every ``quality_NNNp`` URL found in the page.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))'
+    _TEST = {
+        u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015',
+        u'file': u'648719015.mp4',
+        u'md5': u'882f488fa1f0026f023f33576004a2ed',
+        u'info_dict': {
+            u"uploader": u"BABES-COM",
+            u"title": u"Seductive Indian beauty strips down and fingers her pink pussy",
+            u"age_limit": 18
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the video page at *url*."""
+        match = re.match(self._VALID_URL, url)
+        video_id = match.group('videoid')
+        # Rebuild the address from the matched part only, always with the
+        # http scheme and the www host prefix.
+        url = 'http://www.' + match.group('url')
+
+        request = compat_urllib_request.Request(url)
+        request.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(request, video_id)
+
+        video_title = self._html_search_regex(
+            r'<h1 [^>]+>([^<]+)', webpage, u'title')
+        video_uploader = self._html_search_regex(
+            r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader',
+            fatal=False)
+        thumbnail = self._html_search_regex(
+            r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = compat_urllib_parse.unquote(thumbnail)
+
+        video_urls = [
+            compat_urllib_parse.unquote(quoted_url)
+            for quoted_url in re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)
+        ]
+        if '"encrypted":true' in webpage:
+            # The password is the page's video_title value with every '+'
+            # replaced by a space.
+            password = self._html_search_regex(
+                r'"video_title":"([^"]+)', webpage, u'password').replace('+', ' ')
+            video_urls = [
+                aes_decrypt_text(encrypted_url, password, 32).decode('utf-8')
+                for encrypted_url in video_urls
+            ]
+
+        formats = []
+        for video_url in video_urls:
+            path = compat_urllib_parse_urlparse(video_url).path
+            # The format name is taken from the element at index 5 of the
+            # '/'-split path: its first two '_'-separated fields, joined
+            # with '-'.
+            format_name = "-".join(path.split('/')[5].split('_')[:2])
+            formats.append({
+                'url': video_url,
+                'ext': os.path.splitext(path)[1][1:],
+                'format': format_name,
+                'format_id': format_name,
+            })
+
+        def zero_padded_fields(entry):
+            # Left-pad each field to six characters so that the string
+            # comparison orders numeric fields of different widths.
+            return [field.zfill(6) for field in entry['format'].split('-')]
+
+        formats.sort(key=zero_padded_fields)
+
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py

index 5d770ec285c3d1e3dcad04cfe49ca7780a9dd2b4..35dc5a9ffafb32d36e30f51988291dded6a6d18c 100644 (file)
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -16,7 +16,8 @@ class PornotubeIE(InfoExtractor):
          u'md5': u'374dd6dcedd24234453b295209aa69b6',
          u'info_dict': {
              u"upload_date": u"20090708", 
-            u"title": u"Marilyn-Monroe-Bathing"
+            u"title": u"Marilyn-Monroe-Bathing",
+            u"age_limit": 18
          }
      }
  
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py

index 365aade564bd4f423317d427b3070c91e3e22ad3..3bbda128e1a3881ffd0b7a81e6c45da128994db5 100644 (file)
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -8,9 +8,12 @@ class RedTubeIE(InfoExtractor):
      _TEST = {
          u'url': u'http://www.redtube.com/66418',
          u'file': u'66418.mp4',
-        u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
+        # md5 varies from time to time, as in
+        # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
+        #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
          u'info_dict': {
-            u"title": u"Sucked on a toilet"
+            u"title": u"Sucked on a toilet",
+            u"age_limit": 18,
          }
      }
  
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py

index d1b08c9bc050b3639ca252f2e84a373a8e4fa5f9..2f238de35832d61222331cf423e2691d8de52721 100644 (file)
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -62,19 +62,6 @@ class RTLnowIE(InfoExtractor):
              u'skip_download': True,
          },
      },
-    {
-        u'url': u'http://www.rtlnitronow.de/recht-ordnung/lebensmittelkontrolle-erlangenordnungsamt-berlin.php?film_id=127367&player=1&season=1',
-        u'file': u'127367.flv',
-        u'info_dict': {
-            u'upload_date': u'20130926', 
-            u'title': u'Recht & Ordnung - Lebensmittelkontrolle Erlangen/Ordnungsamt...',
-            u'description': u'Lebensmittelkontrolle Erlangen/Ordnungsamt Berlin',
-            u'thumbnail': u'http://autoimg.static-fra.de/nitronow/344787/1500x1500/image2.jpg',
-        },
-        u'params': {
-            u'skip_download': True,
-        },
-    },
      {
          u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
          u'file': u'124903.flv',
diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py

index 2cba530766a2b898967aefe1c3f7761ebe986d45..f5003c7f91bc78d10a63d25604537e5e77f9fdb8 100644 (file)
--- a/youtube_dl/extractor/slashdot.py
+++ b/youtube_dl/extractor/slashdot.py
@@ -7,6 +7,7 @@ class SlashdotIE(InfoExtractor):
      _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'
  
      _TEST = {
+        u'add_ie': ['Ooyala'],
          u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
          u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
          u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py

index 29cd5617c7d1919fa95e0b48e7ff35585106b800..67b2dff9c9ec431bb7ddd501bb804f6cc572cb61 100644 (file)
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,19 +29,37 @@ class SoundcloudIE(InfoExtractor):
                      )
                      '''
      IE_NAME = u'soundcloud'
-    _TEST = {
-        u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
-        u'file': u'62986583.mp3',
-        u'md5': u'ebef0a451b909710ed1d7787dddbf0d7',
-        u'info_dict': {
-            u"upload_date": u"20121011", 
-            u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", 
-            u"uploader": u"E.T. ExTerrestrial Music", 
-            u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
-        }
-    }
+    _TESTS = [
+        {
+            u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
+            u'file': u'62986583.mp3',
+            u'md5': u'ebef0a451b909710ed1d7787dddbf0d7',
+            u'info_dict': {
+                u"upload_date": u"20121011", 
+                u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", 
+                u"uploader": u"E.T. ExTerrestrial Music", 
+                u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+            }
+        },
+        # not streamable song
+        {
+            u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
+            u'info_dict': {
+                u'id': u'47127627',
+                u'ext': u'mp3',
+                u'title': u'Goldrushed',
+                u'uploader': u'The Royal Concept',
+                u'upload_date': u'20120521',
+            },
+            u'params': {
+                # rtmp
+                u'skip_download': True,
+            },
+        },
+    ]
  
      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
  
      @classmethod
      def suitable(cls, url):
@@ -56,24 +74,48 @@ class SoundcloudIE(InfoExtractor):
          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
  
      def _extract_info_dict(self, info, full_title=None, quiet=False):
-        video_id = info['id']
-        name = full_title or video_id
+        track_id = compat_str(info['id'])
+        name = full_title or track_id
          if quiet == False:
              self.report_extraction(name)
  
          thumbnail = info['artwork_url']
          if thumbnail is not None:
              thumbnail = thumbnail.replace('-large', '-t500x500')
-        return {
-            'id':       info['id'],
-            'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+        result = {
+            'id':       track_id,
              'uploader': info['user']['username'],
              'upload_date': unified_strdate(info['created_at']),
              'title':    info['title'],
-            'ext':      u'mp3',
+            'ext':      info.get('original_format', u'mp3'),
              'description': info['description'],
              'thumbnail': thumbnail,
          }
+        if info.get('downloadable', False):
+            # Downloadable tracks: the download link can be built directly
+            # from the track id and the client id.
+            result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID)
+        else:
+            # Otherwise the stream location has to be requested from the
+            # streams endpoint, which is queried with the iPhone client id.
+            stream_json = self._download_webpage(
+                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
+                track_id, u'Downloading track url')
+            # There should be only one entry in the dictionary
+            key, stream_url = list(json.loads(stream_json).items())[0]
+            if key.startswith(u'http'):
+                result['url'] = stream_url
+            elif key.startswith(u'rtmp'):
+                # The url doesn't have an rtmp app, so it is split at the
+                # 'mp3:' marker into the connection url and the play path.
+                url, path = stream_url.split('mp3:', 1)
+                result.update({
+                    'url': url,
+                    'play_path': 'mp3:' + path,
+                })
+            else:
+                # Fall back to the stream_url from the original info. This
+                # cannot always be used; it sometimes gives an HTTP 404 error.
+                # (No trailing comma here: the value must be a string, not a
+                # one-element tuple.)
+                result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID
+
+        return result
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
@@ -106,70 +148,8 @@ class SoundcloudIE(InfoExtractor):
  class SoundcloudSetIE(SoundcloudIE):
      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
      IE_NAME = u'soundcloud:set'
-    _TEST = {
-        u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep",
-        u"playlist": [
-            {
-                u"file":"30510138.mp3",
-                u"md5":"f9136bf103901728f29e419d2c70f55d",
-                u"info_dict": {
-                    u"upload_date": u"20111213",
-                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"D-D-Dance"
-                }
-            },
-            {
-                u"file":"47127625.mp3",
-                u"md5":"09b6758a018470570f8fd423c9453dd8",
-                u"info_dict": {
-                    u"upload_date": u"20120521",
-                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"The Royal Concept - Gimme Twice"
-                }
-            },
-            {
-                u"file":"47127627.mp3",
-                u"md5":"154abd4e418cea19c3b901f1e1306d9c",
-                u"info_dict": {
-                    u"upload_date": u"20120521",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"Goldrushed"
-                }
-            },
-            {
-                u"file":"47127629.mp3",
-                u"md5":"2f5471edc79ad3f33a683153e96a79c1",
-                u"info_dict": {
-                    u"upload_date": u"20120521",
-                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"In the End"
-                }
-            },
-            {
-                u"file":"47127631.mp3",
-                u"md5":"f9ba87aa940af7213f98949254f1c6e2",
-                u"info_dict": {
-                    u"upload_date": u"20120521",
-                    u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"Knocked Up"
-                }
-            },
-            {
-                u"file":"75206121.mp3",
-                u"md5":"f9d1fe9406717e302980c30de4af9353",
-                u"info_dict": {
-                    u"upload_date": u"20130116",
-                    u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ",
-                    u"uploader": u"The Royal Concept",
-                    u"title": u"World On Fire"
-                }
-            }
-        ]
-    }
+    # it's in tests/test_playlists.py
+    _TESTS = []
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -188,7 +168,6 @@ class SoundcloudSetIE(SoundcloudIE):
          resolv_url = self._resolv_url(url)
          info_json = self._download_webpage(resolv_url, full_title)
  
-        videos = []
          info = json.loads(info_json)
          if 'errors' in info:
              for err in info['errors']:
@@ -208,7 +187,7 @@ class SoundcloudUserIE(SoundcloudIE):
      IE_NAME = u'soundcloud:user'
  
      # it's in tests/test_playlists.py
-    _TEST = None
+    _TESTS = []
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py

index b1e96b679b63a0c728eb039dee76cb209e5e9976..a711531e668bbc3ba32bfa3a93872c5f25ac73ab 100644 (file)
--- a/youtube_dl/extractor/southparkstudios.py
+++ b/youtube_dl/extractor/southparkstudios.py
@@ -5,21 +5,19 @@ from .mtv import MTVIE, _media_xml_tag
  
  class SouthParkStudiosIE(MTVIE):
      IE_NAME = u'southparkstudios.com'
-    _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)'
+    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
  
      _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
  
-    _TEST = {
+    # Overwrite MTVIE properties we don't want
+    _TESTS = [{
          u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
          u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
          u'info_dict': {
              u'title': u'Bat Daded',
              u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
          },
-    }
-
-    # Overwrite MTVIE properties we don't want
-    _TESTS = []
+    }]
  
      def _get_thumbnail_url(self, uri, itemdoc):
          search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
@@ -31,8 +29,23 @@ class SouthParkStudiosIE(MTVIE):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
+        url = u'http://www.' + mobj.group(u'url')
          video_id = mobj.group('id')
          webpage = self._download_webpage(url, video_id)
          mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"',
                                    webpage, u'mgid')
          return self._get_videos_info(mgid)
+
+class SouthparkDeIE(SouthParkStudiosIE):
+    """Variant of SouthParkStudiosIE for southpark.de pages.
+
+    Only class attributes are overridden here; no methods are defined, so
+    extraction itself is inherited from SouthParkStudiosIE.
+    """
+    IE_NAME = u'southpark.de'
+    # Accepts clip and "alle-episoden" pages, with or without scheme and
+    # "www." prefix; exposes the named groups `url` and `id`.
+    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
+
+    # Replaces the test list of the parent class with a southpark.de clip.
+    _TESTS = [{
+        u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+        u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4',
+        u'info_dict': {
+            u'title': u'The Government Won\'t Respect My Privacy',
+            u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+        },
+    }]
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py

new file mode 100644 (file)

index 0000000..0d32a06
--- /dev/null
+++ b/youtube_dl/extractor/space.py
@@ -0,0 +1,35 @@
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from ..utils import RegexNotFoundError, ExtractorError
+
+
+class SpaceIE(InfoExtractor):
+    """Information extractor for space.com video pages.
+
+    The page itself carries no media URL handling here: the extractor only
+    locates the Brightcove URL in the page and hands it to BrightcoveIE.
+    """
+
+    # The dot before "html" is escaped so that it only matches a literal
+    # '.', not an arbitrary character.
+    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
+    _TEST = {
+        u'add_ie': ['Brightcove'],
+        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
+        u'info_dict': {
+            u'id': u'2780937028001',
+            u'ext': u'mp4',
+            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
+            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61',
+            u'uploader': u'TechMedia Networks',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a url_result pointing BrightcoveIE at the page's video.
+
+        Raises ExtractorError (marked as expected) when no Brightcove URL
+        can be found in the page.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        # The title slug from the URL is only used to label the download.
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        try:
+            # Some videos require the playerKey field, which isn't defined
+            # in the BrightcoveExperience object
+            brightcove_url = self._og_search_video_url(webpage)
+        except RegexNotFoundError:
+            # Other videos work fine with the info from the object
+            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage)
+        if brightcove_url is None:
+            raise ExtractorError(u'The webpage does not contain a video', expected=True)
+        return self.url_result(brightcove_url, BrightcoveIE.ie_key())
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py

new file mode 100644 (file)

index 0000000..9e2ad0d
--- /dev/null
+++ b/youtube_dl/extractor/spankwire.py
@@ -0,0 +1,74 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+from ..aes import (
+    aes_decrypt_text
+)
+
+class SpankwireIE(InfoExtractor):
+    """Information extractor for spankwire.com video pages.
+
+    The page is requested with the ``age_verified`` cookie set; metadata is
+    scraped from the HTML and one format entry is produced for every
+    ``flashvars.quality_NNNp`` URL found in the page.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
+    _TEST = {
+        u'url': u'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
+        u'file': u'103545.mp4',
+        u'md5': u'1b3f55e345500552dbc252a3e9c1af43',
+        u'info_dict': {
+            u"uploader": u"oreusz",
+            u"title": u"Buckcherry`s X Rated Music Video Crazy Bitch",
+            u"description": u"Crazy Bitch X rated music video.",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the video page at *url*."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        # Rebuild the address from the matched part only, always with the
+        # http scheme and the www host prefix.
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title')
+        video_uploader = self._html_search_regex(
+            r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
+        thumbnail = self._html_search_regex(
+            r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
+        description = self._html_search_regex(
+            r'<div\s+id="descriptionContent">([^<]+)<', webpage, u'description', fatal=False)
+
+        video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
+        # Plain substring test: the marker is literal text, not a regular
+        # expression, so the dot must not be backslash-escaped here.
+        if 'flashvars.encrypted = "true"' in webpage:
+            # The password is the page's video_title value with every '+'
+            # replaced by a space.
+            password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, u'password').replace('+', ' ')
+            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
+
+        formats = []
+        for video_url in video_urls:
+            path = compat_urllib_parse_urlparse(video_url).path
+            extension = os.path.splitext(path)[1][1:]
+            # The format name is taken from the element at index 4 of the
+            # '/'-split path: its first two '_'-separated fields, joined
+            # with '-'.
+            format_name = "-".join(path.split('/')[4].split('_')[:2])
+            formats.append({
+                'url': video_url,
+                'ext': extension,
+                'format': format_name,
+                'format_id': format_name,
+            })
+        # Left-pad each field to six characters so that the string
+        # comparison orders numeric fields of different widths.
+        formats.sort(key=lambda f: list(map(lambda s: s.zfill(6), f['format'].split('-'))))
+
+        # The age limit is whatever _rta_search reports for this page.
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'description': description,
+            'formats': formats,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py

index 13c86401c0095bd5584e03ac35614d06edb76be1..19ce585cf1f6dc89569fa56cd6dc30c23dd17e61 100644 (file)
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -6,14 +6,22 @@ from .common import InfoExtractor
  
  class SpiegelIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
          u'file': u'1259285.mp4',
          u'md5': u'2c2754212136f35fb4b19767d242f66e',
          u'info_dict': {
              u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"
          }
-    }
+    },
+    {
+        u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+        u'file': u'1309159.mp4',
+        u'md5': u'f2cdf638d7aa47654e251e1aee360af1',
+        u'info_dict': {
+            u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers'
+        }
+    }]
  
      def _real_extract(self, url):
          m = re.match(self._VALID_URL, url)
@@ -21,25 +29,38 @@ class SpiegelIE(InfoExtractor):
  
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
-            webpage, u'title')
+        video_title = self._html_search_regex(
+            r'<div class="module-title">(.*?)</div>', webpage, u'title')
  
          xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
-        xml_code = self._download_webpage(xml_url, video_id,
-                    note=u'Downloading XML', errnote=u'Failed to download XML')
+        xml_code = self._download_webpage(
+            xml_url, video_id,
+            note=u'Downloading XML', errnote=u'Failed to download XML')
  
          idoc = xml.etree.ElementTree.fromstring(xml_code)
-        last_type = idoc[-1]
-        filename = last_type.findall('./filename')[0].text
-        duration = float(last_type.findall('./duration')[0].text)
  
-        video_url = 'http://video2.spiegel.de/flash/' + filename
-        video_ext = filename.rpartition('.')[2]
+        formats = [
+            {
+                'format_id': n.tag.rpartition('type')[2],
+                'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text,
+                'width': int(n.find('./width').text),
+                'height': int(n.find('./height').text),
+                'abr': int(n.find('./audiobitrate').text),
+                'vbr': int(n.find('./videobitrate').text),
+                'vcodec': n.find('./codec').text,
+                'acodec': 'MP4A',
+            }
+            for n in list(idoc)
+            # Blacklist type 6, it's extremely LQ and not available on the same server
+            if n.tag.startswith('type') and n.tag != 'type6'
+        ]
+        formats.sort(key=lambda f: f['vbr'])
+        duration = float(idoc[0].findall('./duration')[0].text)
+
          info = {
              'id': video_id,
-            'url': video_url,
-            'ext': video_ext,
              'title': video_title,
              'duration': duration,
+            'formats': formats,
          }
-        return [info]
+        return info
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py

new file mode 100644 (file)

index 0000000..9faf3a5
--- /dev/null
+++ b/youtube_dl/extractor/streamcloud.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+    """Information extractor for streamcloud.eu video pages.
+
+    The landing page contains a form; its hidden and submit fields are
+    posted back to the same URL after a fixed wait, and the video details
+    are scraped from the page returned by that POST.
+    """
+
+    IE_NAME = u'streamcloud.eu'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+
+    _TEST = {
+        u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
+        u'file': u'skp9j99s4bpz.mp4',
+        u'md5': u'6bea4c7fa5daaacc2a946b7146286686',
+        u'info_dict': {
+            u'title': u'youtube-dl test video  \'/\\ ä ↭',
+            u'duration': 9,
+        },
+        u'skip': u'Only available from the EU'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the video page at *url*."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        orig_webpage = self._download_webpage(url, video_id)
+
+        # Collect (name, value) pairs of every hidden/submit input so the
+        # form can be re-submitted as is.
+        fields = re.findall(r'''(?x)<input\s+
+            type="(?:hidden|submit)"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', orig_webpage)
+        # urlencode() returns text; the request body has to be bytes on
+        # Python 3 (str POST data raises TypeError there). The encoded form
+        # is plain ASCII, so this is a no-op on Python 2.
+        post = compat_urllib_parse.urlencode(fields).encode('ascii')
+
+        # NOTE(review): presumably the site rejects the form when it is
+        # posted before its countdown has elapsed - confirm the 12 seconds.
+        self.to_screen('%s: Waiting for timeout' % video_id)
+        time.sleep(12)
+        headers = {
+            b'Content-Type': b'application/x-www-form-urlencoded',
+        }
+        req = compat_urllib_request.Request(url, post, headers)
+
+        webpage = self._download_webpage(
+            req, video_id, note=u'Downloading video page ...')
+        title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)<', webpage, u'title')
+        video_url = self._search_regex(
+            r'file:\s*"([^"]+)"', webpage, u'video URL')
+        # Duration and thumbnail are optional: a missing match yields None.
+        duration_str = self._search_regex(
+            r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False)
+        duration = None if duration_str is None else int(duration_str)
+        thumbnail = self._search_regex(
+            r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py

index 90de7de3a709d4385b29e62d44ae1e82349d883e..4b4c5235d09ea8a6f75b7d182abf87bfc573557f 100644 (file)
--- a/youtube_dl/extractor/subtitles.py
+++ b/youtube_dl/extractor/subtitles.py
@@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
          return any([self._downloader.params.get('writesubtitles', False),
                      self._downloader.params.get('writeautomaticsub')])
  
-    def _list_available_subtitles(self, video_id, webpage=None):
+    def _list_available_subtitles(self, video_id, webpage):
          """ outputs the available subtitles for the video """
-        sub_lang_list = self._get_available_subtitles(video_id)
+        sub_lang_list = self._get_available_subtitles(video_id, webpage)
          auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
          sub_lang = ",".join(list(sub_lang_list.keys()))
          self.to_screen(u'%s: Available subtitles for video: %s' %
@@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
          self.to_screen(u'%s: Available automatic captions for video: %s' %
                         (video_id, auto_lang))
  
-    def extract_subtitles(self, video_id, video_webpage=None):
+    def extract_subtitles(self, video_id, webpage):
          """
          returns {sub_lang: sub} ,{} if subtitles not found or None if the
          subtitles aren't requested.
@@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor):
              return None
          available_subs_list = {}
          if self._downloader.params.get('writeautomaticsub', False):
-            available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
+            available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))
          if self._downloader.params.get('writesubtitles', False):
-            available_subs_list.update(self._get_available_subtitles(video_id))
+            available_subs_list.update(self._get_available_subtitles(video_id, webpage))
  
          if not available_subs_list:  # error, it didn't get the available subtitles
              return {}
@@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
              return
          return sub
  
-    def _get_available_subtitles(self, video_id):
+    def _get_available_subtitles(self, video_id, webpage):
          """
          returns {sub_lang: url} or {} if not available
          Must be redefined by the subclasses
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py

index 81fa35c4bd297f6b6a4b5fa44ed1b98998393ab6..c9359fafb5c5989923c6320e3e684673b80057d6 100644 (file)
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -15,7 +15,8 @@ class SztvHuIE(InfoExtractor):
          u'info_dict': {
              u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
              u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
-        }
+        },
+        u'skip': u'Service temporarily disabled as of 2013-11-20'
      }
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py

index c910110ca9775d9ad03011238aacdc3c9ef4dae1..165d9f88bc984ef80f1fd7aa4ff0d0b10e5fca45 100644 (file)
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -1,4 +1,5 @@
  import re
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -11,7 +12,7 @@ class TeamcocoIE(InfoExtractor):
      _TEST = {
          u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
          u'file': u'19705.mp4',
-        u'md5': u'27b6f7527da5acf534b15f21b032656e',
+        u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a',
          u'info_dict': {
              u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.", 
              u"title": u"Louis C.K. Interview Pt. 1 11/3/11"
@@ -31,16 +32,40 @@ class TeamcocoIE(InfoExtractor):
          self.report_extraction(video_id)
  
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
-        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
+        data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage')
+        data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8'))
  
-        video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>',
-            data, u'video URL')
  
-        return [{
+        qualities = ['500k', '480p', '1000k', '720p', '1080p']
+        formats = []
+        for file in data.findall('files/file'):
+            if file.attrib.get('playmode') == 'all':
+                # it just duplicates one of the entries
+                break
+            file_url = file.text
+            m_format = re.search(r'(\d+(k|p))\.mp4', file_url)
+            if m_format is not None:
+                format_id = m_format.group(1)
+            else:
+                format_id = file.attrib['bitrate']
+            formats.append({
+                'url': file_url,
+                'ext': 'mp4',
+                'format_id': format_id,
+            })
+        def sort_key(f):
+            try:
+                return qualities.index(f['format_id'])
+            except ValueError:
+                return -1
+        formats.sort(key=sort_key)
+        if not formats:
+            raise ExtractorError(u'Unable to extract video URL')
+
+        return {
              'id':          video_id,
-            'url':         video_url,
-            'ext':         'mp4',
+            'formats': formats,
              'title':       self._og_search_title(webpage),
              'thumbnail':   self._og_search_thumbnail(webpage),
              'description': self._og_search_description(webpage),
-        }]
+        }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py

index dfa1176a3e4e4eef333dcb829773c189bf9916ba..4bca62ba003e325ebedd0fcc74c953bd64120cd5 100644 (file)
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -1,10 +1,13 @@
  import json
  import re
  
-from .common import InfoExtractor
+from .subtitles import SubtitlesInfoExtractor
  
+from ..utils import (
+    RegexNotFoundError,
+)
  
-class TEDIE(InfoExtractor):
+class TEDIE(SubtitlesInfoExtractor):
      _VALID_URL=r'''http://www\.ted\.com/
                     (
                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
@@ -32,33 +35,32 @@ class TEDIE(InfoExtractor):
      def _real_extract(self, url):
          m=re.match(self._VALID_URL, url, re.VERBOSE)
          if m.group('type_talk'):
-            return [self._talk_info(url)]
+            return self._talk_info(url)
          else :
              playlist_id=m.group('playlist_id')
              name=m.group('name')
              self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
              return [self._playlist_videos_info(url,name,playlist_id)]
  
-    def _playlist_videos_info(self,url,name,playlist_id=0):
+
+    def _playlist_videos_info(self, url, name, playlist_id):
          '''Returns the videos of the playlist'''
-        video_RE=r'''
-                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
-                     ([.\s]*?)data-playlist_item_id="(\d+)"
-                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
-                     '''
-        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
-        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
-        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
-        m_names=re.finditer(video_name_RE,webpage)
+
+        webpage = self._download_webpage(
+            url, playlist_id, u'Downloading playlist webpage')
+        matches = re.finditer(
+            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
+            webpage)
  
          playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                   webpage, 'playlist title')
  
-        playlist_entries = []
-        for m_video, m_name in zip(m_videos,m_names):
-            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
-            playlist_entries.append(self.url_result(talk_url, 'TED'))
-        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
+        playlist_entries = [
+            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
+            for m in matches
+        ]
+        return self.playlist_result(
+            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
  
      def _talk_info(self, url, video_id=0):
          """Return the video for the talk in the url"""
@@ -81,16 +83,35 @@ class TEDIE(InfoExtractor):
              'ext': 'mp4',
              'url': stream['file'],
              'format': stream['id']
-            } for stream in info['htmlStreams']]
-        info = {
-            'id': info['id'],
+        } for stream in info['htmlStreams']]
+
+        video_id = info['id']
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(video_id, webpage)
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id, webpage)
+            return
+
+        return {
+            'id': video_id,
              'title': title,
              'thumbnail': thumbnail,
              'description': desc,
+            'subtitles': video_subtitles,
              'formats': formats,
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(info['formats'][-1])
-
-        return info
+    def _get_available_subtitles(self, video_id, webpage):
+        try:
+            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
+            languages = re.findall(r'(?:<option value=")(\S+)"', options)
+            if languages:
+                sub_lang_list = {}
+                for l in languages:
+                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+                    sub_lang_list[l] = url
+                return sub_lang_list
+        except RegexNotFoundError:
+            self._downloader.report_warning(u'video doesn\'t have subtitles')
+        return {}
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py

new file mode 100644 (file)

index 0000000..2f728d3
--- /dev/null
+++ b/youtube_dl/extractor/toutv.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class TouTvIE(InfoExtractor):
+    IE_NAME = u'tou.tv'
+    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+
+    _TEST = {
+        u'url': u'http://www.tou.tv/30-vies/S04E41',
+        u'file': u'30-vies_S04E41.mp4',
+        u'info_dict': {
+            u'title': u'30 vies Saison 4 / Épisode 41',
+            u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
+            u'age_limit': 8,
+            u'uploader': u'Groupe des Nouveaux Médias',
+            u'duration': 1296,
+            u'upload_date': u'20131118',
+            u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+        },
+        u'params': {
+            u'skip_download': True,  # Requires rtmpdump
+        },
+        u'skip': 'Only available in Canada'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        mediaId = self._search_regex(
+            r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
+
+        streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
+        streams_webpage = self._download_webpage(
+            streams_url, video_id, note=u'Downloading stream list')
+
+        streams_doc = xml.etree.ElementTree.fromstring(
+            streams_webpage.encode('utf-8'))
+        video_url = next(n.text
+                         for n in streams_doc.findall('.//choice/url')
+                         if u'//ad.doubleclick' not in n.text)
+        if video_url.endswith('/Unavailable.flv'):
+            raise ExtractorError(
+                u'Access to this video is blocked from outside of Canada',
+                expected=True)
+
+        duration_str = self._html_search_meta(
+            'video:duration', webpage, u'duration')
+        duration = int(duration_str) if duration_str else None
+        upload_date_str = self._html_search_meta(
+            'video:release_date', webpage, u'upload date')
+        upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'url': video_url,
+            'description': self._og_search_description(webpage),
+            'uploader': self._dc_search_uploader(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'age_limit': self._media_rating_search(webpage),
+            'duration': duration,
+            'upload_date': upload_date,
+            'ext': 'mp4',
+        }
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py

new file mode 100644 (file)

index 0000000..4d9d41d
--- /dev/null
+++ b/youtube_dl/extractor/tube8.py
@@ -0,0 +1,63 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+)
+from ..aes import (
+    aes_decrypt_text
+)
+
+class Tube8IE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/[^/]+/[^/]+/(?P<videoid>[0-9]+)/?)'
+    _TEST = {
+        u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/',
+        u'file': u'229795.mp4',
+        u'md5': u'e9e0b0c86734e5e3766e653509475db0',
+        u'info_dict': {
+            u"description": u"hot teen Kasia grinding", 
+            u"uploader": u"unknown", 
+            u"title": u"Kasia music video",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'videotitle     ="([^"]+)', webpage, u'title')
+        video_description = self._html_search_regex(r'>Description:</strong>(.+?)<', webpage, u'description', fatal=False)
+        video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False)
+        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = thumbnail.replace('\\/', '/')
+
+        video_url = self._html_search_regex(r'"video_url":"([^"]+)', webpage, u'video_url')
+        if webpage.find('"encrypted":true')!=-1:
+            password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')
+            video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        format = path.split('/')[4].split('_')[:2]
+        format = "-".join(format)
+
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'description': video_description,
+            'url': video_url,
+            'ext': extension,
+            'format': format,
+            'format_id': format,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py

new file mode 100644 (file)

index 0000000..bfed9dd
--- /dev/null
+++ b/youtube_dl/extractor/tvp.py
@@ -0,0 +1,42 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class TvpIE(InfoExtractor):
+    IE_NAME = u'tvp.pl'
+    _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238',
+        u'md5': u'148408967a6a468953c0a75cbdaf0d7a',
+        u'file': u'12878238.wmv',
+        u'info_dict': {
+            u'title': u'31.10.2013 - Odcinek 2',
+            u'description': u'31.10.2013 - Odcinek 2',
+        },
+        u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id
+        json_params = self._download_webpage(
+            json_url, video_id, u"Downloading video metadata")
+
+        params = json.loads(json_params)
+        self.report_extraction(video_id)
+        video_url = params['video_url']
+
+        title = self._og_search_title(webpage, fatal=True)
+        return {
+            'id': video_id,
+            'title': title,
+            'ext': 'wmv',
+            'url': video_url,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index 1c1cc418d29a8897e2a2825492ed7becab75af6b..4378b17800f1df78275d68a9525ca95585dc8b9d 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -5,7 +5,7 @@ import datetime
  
  from .common import InfoExtractor
  from ..utils import (
-    determine_ext,
+    compat_HTTPError,
      ExtractorError,
  )
  
@@ -16,26 +16,22 @@ class VevoIE(InfoExtractor):
      (currently used by MTVIE)
      """
      _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
          u'file': u'GB1101300280.mp4',
+        u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",
          u'info_dict': {
              u"upload_date": u"20130624",
              u"uploader": u"Hurts",
              u"title": u"Somebody to Die For",
-            u'duration': 230,
+            u"duration": 230,
+            u"width": 1920,
+            u"height": 1080,
          }
-    }
+    }]
+    _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
  
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
-        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
-
-        self.report_extraction(video_id)
-        video_info = json.loads(info_json)['video']
+    def _formats_from_json(self, video_info):
          last_version = {'version': -1}
          for version in video_info['videoVersions']:
              # These are the HTTP downloads, other types are for different manifests
@@ -50,17 +46,75 @@ class VevoIE(InfoExtractor):
          # Already sorted from worst to best quality
          for rend in renditions.findall('rendition'):
              attr = rend.attrib
-            f_url = attr['url']
+            format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
              formats.append({
-                'url': f_url,
-                'ext': determine_ext(f_url),
+                'url': attr['url'],
+                'format_id': attr['name'],
+                'format_note': format_note,
                  'height': int(attr['frameheight']),
                  'width': int(attr['frameWidth']),
              })
+        return formats
+
+    def _formats_from_smil(self, smil_xml):
+        formats = []
+        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+        els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
+        for el in els:
+            src = el.attrib['src']
+            m = re.match(r'''(?xi)
+                (?P<ext>[a-z0-9]+):
+                (?P<path>
+                    [/a-z0-9]+     # The directory and main part of the URL
+                    _(?P<cbr>[0-9]+)k
+                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
+                    _(?P<vcodec>[a-z0-9]+)
+                    _(?P<vbr>[0-9]+)
+                    _(?P<acodec>[a-z0-9]+)
+                    _(?P<abr>[0-9]+)
+                    \.[a-z0-9]+  # File extension
+                )''', src)
+            if not m:
+                continue
  
-        date_epoch = int(self._search_regex(
-            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000
-        upload_date = datetime.datetime.fromtimestamp(date_epoch)
+            format_url = self._SMIL_BASE_URL + m.group('path')
+            formats.append({
+                'url': format_url,
+                'format_id': u'SMIL_' + m.group('cbr'),
+                'vcodec': m.group('vcodec'),
+                'acodec': m.group('acodec'),
+                'vbr': int(m.group('vbr')),
+                'abr': int(m.group('abr')),
+                'ext': m.group('ext'),
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
+        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
+        video_info = json.loads(info_json)['video']
+
+        formats = self._formats_from_json(video_info)
+        try:
+            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+                self._SMIL_BASE_URL, video_id, video_id.lower())
+            smil_xml = self._download_webpage(smil_url, video_id,
+                                              u'Downloading SMIL info')
+            formats.extend(self._formats_from_smil(smil_xml))
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError):
+                raise
+            self._downloader.report_warning(
+                u'Cannot download SMIL information, falling back to JSON ..')
+
+        timestamp_ms = int(self._search_regex(
+            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))
+        upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
          info = {
              'id': video_id,
              'title': video_info['title'],
@@ -71,7 +125,4 @@ class VevoIE(InfoExtractor):
              'duration': video_info['duration'],
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
          return info
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py

index 12c84a985cc8a2ee49b592d504e60cdbe8500eb8..826804af37af54e308f90349e909d3e0e3aa5126 100644 (file)
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -8,7 +8,7 @@ from ..utils import (
  
  
  class ViddlerIE(InfoExtractor):
-    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)'
+    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
      _TEST = {
          u"url": u"http://www.viddler.com/v/43903784",
          u'file': u'43903784.mp4',
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py

index d89f8409443675f4359b85b96ffc0c2d49ec32e5..265dd5b91fd9e5c4fc5a0cac8a9f36dd36731cfe 100644 (file)
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -16,7 +16,7 @@ class VideoDetectiveIE(InfoExtractor):
          u'info_dict': {
              u'title': u'KICK-ASS 2',
              u'description': u'md5:65ba37ad619165afac7d432eaded6013',
-            u'duration': 138,
+            u'duration': 135,
          },
      }
  
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py

index 65f39b98259bc0050b512073b2f26e5bd0e49605..4800415bde2d103b8781ab3954d617fc50166074 100644 (file)
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -24,12 +24,16 @@ class VideoPremiumIE(InfoExtractor):
          webpage_url = 'http://videopremium.tv/' + video_id
          webpage = self._download_webpage(webpage_url, video_id)
  
-        self.report_extraction(video_id)
+        if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+            # Download again, we need a cookie
+            webpage = self._download_webpage(
+                webpage_url, video_id,
+                note=u'Downloading webpage again (with cookie)')
  
-        video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
-            webpage, u'video title')
+        video_title = self._html_search_regex(
+            r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
  
-        return [{
+        return {
              'id':          video_id,
              'url':         "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
              'play_path':   "mp4:%s.f4v" % video_id,
@@ -37,4 +41,4 @@ class VideoPremiumIE(InfoExtractor):
              'player_url':  "http://videopremium.tv/uplayer/uppod.swf",
              'ext':         'f4v',
              'title':       video_title,
-        }]
+        }
+\ No newline at end of file
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py

new file mode 100644 (file)

index 0000000..2206a06
--- /dev/null
+++ b/youtube_dl/extractor/viki.py
@@ -0,0 +1,101 @@
+import re
+
+from ..utils import (
+    ExtractorError,
+    unescapeHTML,
+    unified_strdate,
+)
+from .subtitles import SubtitlesInfoExtractor
+
+
+class VikiIE(SubtitlesInfoExtractor):
+    IE_NAME = u'viki'
+
+    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+    _TEST = {
+        u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
+        u'file': u'1023585v.mp4',
+        u'md5': u'a21454021c2646f5433514177e2caa5f',
+        u'info_dict': {
+            u'title': u'Heirs Episode 14',
+            u'uploader': u'SBS',
+            u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            u'upload_date': u'20131121',
+            u'age_limit': 13,
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        uploader_m = re.search(
+            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
+        if uploader_m is None:
+            uploader = None
+        else:
+            uploader = uploader_m.group(1).strip()
+
+        rating_str = self._html_search_regex(
+            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
+            u'rating information', default='').strip()
+        RATINGS = {
+            'G': 0,
+            'PG': 10,
+            'PG-13': 13,
+            'R': 16,
+            'NC': 18,
+        }
+        age_limit = RATINGS.get(rating_str)
+
+        info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
+        info_webpage = self._download_webpage(
+            info_url, video_id, note=u'Downloading info page')
+        if re.match(r'\s*<div\s+class="video-error', info_webpage):
+            raise ExtractorError(
+                u'Video %s is blocked from your location.' % video_id,
+                expected=True)
+        video_url = self._html_search_regex(
+            r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+
+        upload_date_str = self._html_search_regex(
+            r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+        upload_date = (
+            unified_strdate(upload_date_str)
+            if upload_date_str is not None
+            else None
+        )
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(video_id, info_webpage)
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id, info_webpage)
+            return
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'description': description,
+            'thumbnail': thumbnail,
+            'age_limit': age_limit,
+            'uploader': uploader,
+            'subtitles': video_subtitles,
+            'upload_date': upload_date,
+        }
+
+    def _get_available_subtitles(self, video_id, info_webpage):
+        res = {}
+        for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
+            sturl = unescapeHTML(sturl_html)
+            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
+            if not m:
+                continue
+            res[m.group('lang')] = sturl
+        return res
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index 2de56ac814462e3c3536ccac34b980b3e9a8bfb5..7d82c2cfa84bd9b57b7ebc9eb35537b4033ba45d 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
  import json
  import re
  import itertools
@@ -10,6 +11,7 @@ from ..utils import (
      clean_html,
      get_element_by_attribute,
      ExtractorError,
+    RegexNotFoundError,
      std_headers,
      unsmuggle_url,
  )
@@ -18,12 +20,12 @@ class VimeoIE(InfoExtractor):
      """Information extractor for vimeo.com."""
  
      # _VALID_URL matches Vimeo URLs
-    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?$'
+    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
      _NETRC_MACHINE = 'vimeo'
      IE_NAME = u'vimeo'
      _TESTS = [
          {
-            u'url': u'http://vimeo.com/56015672',
+            u'url': u'http://vimeo.com/56015672#at=0',
              u'file': u'56015672.mp4',
              u'md5': u'8879b6cc097e987f02484baf890129e5',
              u'info_dict': {
@@ -54,7 +56,22 @@ class VimeoIE(InfoExtractor):
                  u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
                  u'uploader': u'The BLN & Business of Software',
              },
-        }
+        },
+        {
+            u'url': u'http://vimeo.com/68375962',
+            u'file': u'68375962.mp4',
+            u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7',
+            u'note': u'Video protected with password',
+            u'info_dict': {
+                u'title': u'youtube-dl password protected test video',
+                u'upload_date': u'20130614',
+                u'uploader_id': u'user18948128',
+                u'uploader': u'Jaime Marquínez Ferrándiz',
+            },
+            u'params': {
+                u'videopassword': u'youtube-dl',
+            },
+        },
      ]
  
      def _login(self):
@@ -111,11 +128,9 @@ class VimeoIE(InfoExtractor):
              raise ExtractorError(u'Invalid URL: %s' % url)
  
          video_id = mobj.group('id')
-        if not mobj.group('proto'):
-            url = 'https://' + url
-        elif mobj.group('pro'):
+        if mobj.group('pro') or mobj.group('player'):
              url = 'http://player.vimeo.com/video/' + video_id
-        elif mobj.group('direct_link'):
+        else:
              url = 'https://vimeo.com/' + video_id
  
          # Retrieve video webpage to extract further information
@@ -129,18 +144,26 @@ class VimeoIE(InfoExtractor):
  
          # Extract the config JSON
          try:
-            config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
-                webpage, u'info section', flags=re.DOTALL)
-            config = json.loads(config)
-        except:
+            try:
+                config_url = self._html_search_regex(
+                    r' data-config-url="(.+?)"', webpage, u'config URL')
+                config_json = self._download_webpage(config_url, video_id)
+                config = json.loads(config_json)
+            except RegexNotFoundError:
+                # For pro videos or player.vimeo.com urls
+                config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
+                    webpage, u'info section', flags=re.DOTALL)
+                config = json.loads(config)
+        except Exception as e:
              if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                  raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
  
-            if re.search('If so please provide the correct password.', webpage):
+            if re.search('<form[^>]+?id="pw_form"', webpage) is not None:
                  self._verify_video_password(url, video_id, webpage)
                  return self._real_extract(url)
              else:
-                raise ExtractorError(u'Unable to extract info section')
+                raise ExtractorError(u'Unable to extract info section',
+                                     cause=e)
  
          # Extract title
          video_title = config["video"]["title"]
@@ -179,47 +202,47 @@ class VimeoIE(InfoExtractor):
  
          # Vimeo specific: extract video codec and quality information
          # First consider quality, then codecs, then take everything
-        # TODO bind to format param
-        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
-        files = { 'hd': [], 'sd': [], 'other': []}
+        codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
+        files = {'hd': [], 'sd': [], 'other': []}
          config_files = config["video"].get("files") or config["request"].get("files")
          for codec_name, codec_extension in codecs:
-            if codec_name in config_files:
-                if 'hd' in config_files[codec_name]:
-                    files['hd'].append((codec_name, codec_extension, 'hd'))
-                elif 'sd' in config_files[codec_name]:
-                    files['sd'].append((codec_name, codec_extension, 'sd'))
+            for quality in config_files.get(codec_name, []):
+                format_id = '-'.join((codec_name, quality)).lower()
+                key = quality if quality in files else 'other'
+                video_url = None
+                if isinstance(config_files[codec_name], dict):
+                    file_info = config_files[codec_name][quality]
+                    video_url = file_info.get('url')
                  else:
-                    files['other'].append((codec_name, codec_extension, config_files[codec_name][0]))
-
-        for quality in ('hd', 'sd', 'other'):
-            if len(files[quality]) > 0:
-                video_quality = files[quality][0][2]
-                video_codec = files[quality][0][0]
-                video_extension = files[quality][0][1]
-                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
-                break
-        else:
-            raise ExtractorError(u'No known codec found')
+                    file_info = {}
+                if video_url is None:
+                    video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
+                        %(video_id, sig, timestamp, quality, codec_name.upper())
  
-        video_url = None
-        if isinstance(config_files[video_codec], dict):
-            video_url = config_files[video_codec][video_quality].get("url")
-        if video_url is None:
-            video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
-                        %(video_id, sig, timestamp, video_quality, video_codec.upper())
+                files[key].append({
+                    'ext': codec_extension,
+                    'url': video_url,
+                    'format_id': format_id,
+                    'width': file_info.get('width'),
+                    'height': file_info.get('height'),
+                })
+        formats = []
+        for key in ('other', 'sd', 'hd'):
+            formats += files[key]
+        if len(formats) == 0:
+            raise ExtractorError(u'No known codec found')
  
-        return [{
+        return {
              'id':       video_id,
-            'url':      video_url,
              'uploader': video_uploader,
              'uploader_id': video_uploader_id,
              'upload_date':  video_upload_date,
              'title':    video_title,
-            'ext':      video_extension,
              'thumbnail':    video_thumbnail,
              'description':  video_description,
-        }]
+            'formats': formats,
+            'webpage_url': url,
+        }
  
  
  class VimeoChannelIE(InfoExtractor):
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py

index c4ec1f06ffe3ccce17598aeb319047f0890f9a02..651ba317dcd8fffefb2ac938c9f09d5de8356865 100644 (file)
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,7 +27,7 @@ class VineIE(InfoExtractor):
          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
              webpage, u'video URL')
  
-        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
  
          return [{
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py

new file mode 100644 (file)

index 0000000..90d8a6d
--- /dev/null
+++ b/youtube_dl/extractor/vk.py
@@ -0,0 +1,45 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    unescapeHTML,
+)
+
+
+class VKIE(InfoExtractor):
+    IE_NAME = u'vk.com'
+    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+
+    _TEST = {
+        u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
+        u'md5': u'0deae91935c54e00003c2a00646315f0',
+        u'info_dict': {
+            u'id': u'162222515',
+            u'ext': u'flv',
+            u'title': u'ProtivoGunz - Хуёвая песня',
+            u'uploader': u'Noize MC',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
+        info_page = self._download_webpage(info_url, video_id)
+        m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
+        if m_yt is not None:
+            self.to_screen(u'Youtube video detected')
+            return self.url_result(m_yt.group(1), 'Youtube')
+        vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars')
+        vars = json.loads(vars_json)
+
+        return {
+            'id': compat_str(vars['vid']),
+            'url': vars['url240'],
+            'title': unescapeHTML(vars['md_title']),
+            'thumbnail': vars['jpg'],
+            'uploader': vars['md_author'],
+        }
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py

index 0757495bd8a11e22c98c9307d734f9e83529a631..fa784ab994d2b8acede7e4b4496b12779a787de6 100644 (file)
--- a/youtube_dl/extractor/weibo.py
+++ b/youtube_dl/extractor/weibo.py
@@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor):
      _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
  
      _TEST = {
+        u'add_ie': ['Sina'],
          u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
          u'file': u'98322879.flv',
          u'info_dict': {
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py

index 361619694980d3260ff81aeed2d0d07294739a0e..7444d3393a25f8a49778a5bd589aa839591bd9d8 100644 (file)
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -19,7 +19,8 @@ class XHamsterIE(InfoExtractor):
          u'info_dict': {
              u"upload_date": u"20121014", 
              u"uploader_id": u"Ruseful2011", 
-            u"title": u"FemaleAgent Shy beauty takes the bait"
+            u"title": u"FemaleAgent Shy beauty takes the bait",
+            u"age_limit": 18,
          }
      },
      {
@@ -27,28 +28,33 @@ class XHamsterIE(InfoExtractor):
          u'file': u'2221348.flv',
          u'md5': u'e767b9475de189320f691f49c679c4c7',
          u'info_dict': {
-            u"upload_date": u"20130914", 
-            u"uploader_id": u"jojo747400", 
-            u"title": u"Britney Spears  Sexy Booty"
+            u"upload_date": u"20130914",
+            u"uploader_id": u"jojo747400",
+            u"title": u"Britney Spears  Sexy Booty",
+            u"age_limit": 18,
          }
      }]
  
      def _real_extract(self,url):
+        def extract_video_url(webpage):
+            mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
+            if mobj is None:
+                raise ExtractorError(u'Unable to extract media URL')
+            if len(mobj.group('server')) == 0:
+                return compat_urllib_parse.unquote(mobj.group('file'))
+            else:
+                return mobj.group('server')+'/key='+mobj.group('file')
+
+        def is_hd(webpage):
+            return webpage.find('<div class=\'icon iconHD\'>') != -1
+
          mobj = re.match(self._VALID_URL, url)
  
          video_id = mobj.group('id')
          seo = mobj.group('seo')
-        mrss_url = 'http://xhamster.com/movies/%s/%s.html?hd' % (video_id, seo)
+        mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
          webpage = self._download_webpage(mrss_url, video_id)
  
-        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        if len(mobj.group('server')) == 0:
-            video_url = compat_urllib_parse.unquote(mobj.group('file'))
-        else:
-            video_url = mobj.group('server')+'/key='+mobj.group('file')
-
          video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
              webpage, u'title')
  
@@ -72,13 +78,34 @@ class XHamsterIE(InfoExtractor):
          video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
              webpage, u'thumbnail', fatal=False)
  
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      determine_ext(video_url),
-            'title':    video_title,
+        age_limit = self._rta_search(webpage)
+
+        video_url = extract_video_url(webpage)
+        hd = is_hd(webpage)
+        formats = [{
+            'url': video_url,
+            'ext': determine_ext(video_url),
+            'format': 'hd' if hd else 'sd',
+            'format_id': 'hd' if hd else 'sd',
+        }]
+        if not hd:
+            webpage = self._download_webpage(mrss_url+'?hd', video_id)
+            if is_hd(webpage):
+                video_url = extract_video_url(webpage)
+                formats.append({
+                    'url': video_url,
+                    'ext': determine_ext(video_url),
+                    'format': 'hd',
+                    'format_id': 'hd',
+                })
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
              'description': video_description,
              'upload_date': video_upload_date,
              'uploader_id': video_uploader_id,
-            'thumbnail': video_thumbnail
-        }]
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py

index 8a0eb1afdacc4cbe1cbb441b939cff3d7697cf4e..1177a4b14ec04748bebb5ab17db2f0a29c68ca5c 100644 (file)
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -9,7 +9,7 @@ from ..utils import (
  
  
  class XNXXIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
+    _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'
      VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
      VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
      VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py

new file mode 100644 (file)

index 0000000..e3458d2
--- /dev/null
+++ b/youtube_dl/extractor/xtube.py
@@ -0,0 +1,54 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+)
+
+class XTubeIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+    _TEST = {
+        u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_',
+        u'file': u'kVTUy_G222_.mp4',
+        u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab',
+        u'info_dict': {
+            u"title": u"strange erotica",
+            u"description": u"surreal gay themed erotica...almost an ET kind of thing",
+            u"uploader": u"greenshowers",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title')
+        video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False)
+        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None)
+        video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/')
+        path = compat_urllib_parse_urlparse(video_url).path
+        extension = os.path.splitext(path)[1][1:]
+        format = path.split('/')[5].split('_')[:2]
+        format[0] += 'p'
+        format[1] += 'k'
+        format = "-".join(format)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'uploader': video_uploader,
+            'description': video_description,
+            'url': video_url,
+            'ext': extension,
+            'format': format,
+            'format_id': format,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py

index c3b9736d70a7af6fb90cd617312d25fd2d7cc740..90138d7e523a405c20bae8352c6233b5868860f5 100644 (file)
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -13,7 +13,8 @@ class XVideosIE(InfoExtractor):
          u'file': u'939581.flv',
          u'md5': u'1d0c835822f0a71a7bf011855db929d0',
          u'info_dict': {
-            u"title": u"Funny Porns By >>>>S<<<<<< -1"
+            u"title": u"Funny Porns By >>>>S<<<<<< -1",
+            u"age_limit": 18,
          }
      }
  
@@ -46,6 +47,7 @@ class XVideosIE(InfoExtractor):
              'ext': 'flv',
              'thumbnail': video_thumbnail,
              'description': None,
+            'age_limit': 18,
          }
  
          return [info]
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py

index 464b498f584c3e42b613a79589b52a4d32fec413..34e6afb20fb6833ab21501785deb54cf5f0a0e24 100644 (file)
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -132,7 +132,7 @@ class YahooSearchIE(SearchInfoExtractor):
                  mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                  e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                  res['entries'].append(e)
-            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
+            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):
                  break
  
          return res
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py

index 1265639e821bd873b74aeea08811f8c22e966ba1..1fcc518acde9dbb08fef1ccb42a9ee7ae550967a 100644 (file)
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -13,7 +13,8 @@ class YouJizzIE(InfoExtractor):
          u'file': u'2189178.flv',
          u'md5': u'07e15fa469ba384c7693fd246905547c',
          u'info_dict': {
-            u"title": u"Zeichentrick 1"
+            u"title": u"Zeichentrick 1",
+            u"age_limit": 18,
          }
      }
  
@@ -25,6 +26,8 @@ class YouJizzIE(InfoExtractor):
          # Get webpage content
          webpage = self._download_webpage(url, video_id)
  
+        age_limit = self._rta_search(webpage)
+
          # Get the video title
          video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
              webpage, u'title').strip()
@@ -60,6 +63,7 @@ class YouJizzIE(InfoExtractor):
                  'title': video_title,
                  'ext': 'flv',
                  'format': 'flv',
-                'player_url': embed_page_url}
+                'player_url': embed_page_url,
+                'age_limit': age_limit}
  
          return [info]
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py

index 9d88c17f52a25091ea045d2ea0dd6f819da93473..a8fd40c833fb7707eb1cd8760c288da5f2299025 100644 (file)
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -18,7 +18,7 @@ class YoukuIE(InfoExtractor):
          u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
          u"file": u"XNDgyMDQ2NTQw_part00.flv",
          u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b",
-        u"params": { u"test": False },
+        u"params": {u"test": False},
          u"info_dict": {
              u"title": u"youtube-dl test video \"'/\\ä↭𝕐"
          }
@@ -37,8 +37,8 @@ class YoukuIE(InfoExtractor):
          source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
          seed = float(seed)
          for i in range(len(source)):
-            seed  =  (seed * 211 + 30031 ) % 65536
-            index  =  math.floor(seed / 65536 * len(source) )
+            seed  =  (seed * 211 + 30031) % 65536
+            index  =  math.floor(seed / 65536 * len(source))
              mixed.append(source[int(index)])
              source.remove(source[int(index)])
          #return ''.join(mixed)
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py

index b1f93dd1bb90d964916394d88d83aaaf153ba15b..bd0f2cae0298dec0d78f812153976ec6a8434bb0 100644 (file)
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -17,7 +17,7 @@ from ..aes import (
  )
  
  class YouPornIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
      _TEST = {
          u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
          u'file': u'505835.mp4',
@@ -26,27 +26,15 @@ class YouPornIE(InfoExtractor):
              u"upload_date": u"20101221", 
              u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", 
              u"uploader": u"Ask Dan And Jennifer", 
-            u"title": u"Sex Ed: Is It Safe To Masturbate Daily?"
+            u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
+            u"age_limit": 18,
          }
      }
  
-    def _print_formats(self, formats):
-        """Print all available formats"""
-        print(u'Available formats:')
-        print(u'ext\t\tformat')
-        print(u'---------------------------------')
-        for format in formats:
-            print(u'%s\t\t%s'  % (format['ext'], format['format']))
-
-    def _specific(self, req_format, formats):
-        for x in formats:
-            if x["format"] == req_format:
-                return x
-        return None
-
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
  
          req = compat_urllib_request.Request(url)
          req.add_header('Cookie', 'age_verified=1')
@@ -70,27 +58,22 @@ class YouPornIE(InfoExtractor):
          except KeyError:
              raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
  
-        # Get all of the formats available
+        # Get all of the links from the page
          DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
          download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
              webpage, u'download list').strip()
-
-        # Get all of the links from the page
-        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
+        LINK_RE = r'<a href="([^"]+)">'
          links = re.findall(LINK_RE, download_list_html)
-        
-        # Get link of hd video if available
-        mobj = re.search(r'var encryptedQuality720URL = \'(?P<encrypted_video_url>[a-zA-Z0-9+/]+={0,2})\';', webpage)
-        if mobj != None:
-            encrypted_video_url = mobj.group(u'encrypted_video_url')
-            video_url = aes_decrypt_text(encrypted_video_url, video_title, 32).decode('utf-8')
-            links = [video_url] + links
+
+        # Get all encrypted links
+        encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage)
+        for encrypted_link in encrypted_links:
+            link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
+            links.append(link)
          
          if not links:
              raise ExtractorError(u'ERROR: no known formats available for video')
  
-        self.to_screen(u'Links found: %d' % len(links))
-
          formats = []
          for link in links:
  
@@ -98,43 +81,36 @@ class YouPornIE(InfoExtractor):
              # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
              # A path looks like this:
              # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
-            video_url = unescapeHTML( link )
-            path = compat_urllib_parse_urlparse( video_url ).path
-            extension = os.path.splitext( path )[1][1:]
+            video_url = unescapeHTML(link)
+            path = compat_urllib_parse_urlparse(video_url).path
+            extension = os.path.splitext(path)[1][1:]
              format = path.split('/')[4].split('_')[:2]
+
              # size = format[0]
              # bitrate = format[1]
-            format = "-".join( format )
+            format = "-".join(format)
              # title = u'%s-%s-%s' % (video_title, size, bitrate)
  
              formats.append({
-                'id': video_id,
                  'url': video_url,
-                'uploader': video_uploader,
-                'upload_date': upload_date,
-                'title': video_title,
                  'ext': extension,
                  'format': format,
-                'thumbnail': thumbnail,
-                'description': video_description,
-                'age_limit': age_limit,
+                'format_id': format,
              })
  
-        if self._downloader.params.get('listformats', None):
-            self._print_formats(formats)
-            return
-
-        req_format = self._downloader.params.get('format', 'best')
-        self.to_screen(u'Format: %s' % req_format)
-
-        if req_format is None or req_format == 'best':
-            return [formats[0]]
-        elif req_format == 'worst':
-            return [formats[-1]]
-        elif req_format in ('-1', 'all'):
-            return formats
-        else:
-            format = self._specific( req_format, formats )
-            if format is None:
-                raise ExtractorError(u'Requested format not available')
-            return [format]
+        # Sort and remove doubles
+        formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-'))))
+        for i in range(len(formats)-1,0,-1):
+            if formats[i]['format_id'] == formats[i-1]['format_id']:
+                del formats[i]
+        
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'upload_date': upload_date,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'description': video_description,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 143fac98ac93584af73984e5f53baa630d674b13..1bf9cb7d4a73260325f5b8e175b6e57fe87c0c35 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
              return False
  
-        galx = None
-        dsh = None
-        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          galx = match.group(1)
-        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          dsh = match.group(1)
+        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
+                                  login_page, u'Login GALX parameter')
  
          # Log in
          login_form_strs = {
@@ -95,7 +89,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                  u'checkConnection': u'',
                  u'checkedDomains': u'youtube',
                  u'dnConn': u'',
-                u'dsh': dsh,
                  u'pstMsg': u'0',
                  u'rmShown': u'1',
                  u'secTok': u'',
@@ -146,10 +139,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
  
  class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      IE_DESC = u'YouTube.com'
-    _VALID_URL = r"""^
+    _VALID_URL = r"""(?x)^
                       (
-                         (?:https?://)?                                       # http(s):// (optional)
-                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
+                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              tube\.majestyc\.net/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -236,11 +229,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '136': 'mp4',
          '137': 'mp4',
          '138': 'mp4',
-        '139': 'mp4',
-        '140': 'mp4',
-        '141': 'mp4',
          '160': 'mp4',
  
+        # Dash mp4 audio
+        '139': 'm4a',
+        '140': 'm4a',
+        '141': 'm4a',
+
          # Dash webm
          '171': 'webm',
          '172': 'webm',
@@ -344,18 +339,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
              }
          },
-        {
-            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
-            u"file":  u"1ltcDfZMA3U.flv",
-            u"note": u"Test VEVO video (#897)",
-            u"info_dict": {
-                u"upload_date": u"20070518",
-                u"title": u"Maps - It Will Find You",
-                u"description": u"Music video by Maps performing It Will Find You.",
-                u"uploader": u"MuteUSA",
-                u"uploader_id": u"MuteUSA"
-            }
-        },
          {
              u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
              u"file":  u"UxxajLWwzqY.mp4",
@@ -380,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u"uploader_id": u"justintimberlakeVEVO"
              }
          },
+        {
+            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+            u"file":  u"yZIXLfi8CZQ.mp4",
+            u"note": u"Embed-only video (#1746)",
+            u"info_dict": {
+                u"upload_date": u"20120608",
+                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+                u"uploader": u"SET India",
+                u"uploader_id": u"setindia"
+            }
+        },
      ]
  
  
@@ -387,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
          if YoutubePlaylistIE.suitable(url): return False
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+        return re.match(cls._VALID_URL, url) is not None
  
      def __init__(self, *args, **kwargs):
          super(YoutubeIE, self).__init__(*args, **kwargs)
@@ -1036,6 +1031,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          """Turn the encrypted s field into a working signature"""
  
          if player_url is not None:
+            if player_url.startswith(u'//'):
+                player_url = u'https:' + player_url
              try:
                  player_id = (player_url, len(s))
                  if player_id not in self._player_cache:
@@ -1099,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          else:
              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
  
-    def _get_available_subtitles(self, video_id):
+    def _get_available_subtitles(self, video_id, webpage):
          try:
              sub_list = self._download_webpage(
                  'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -1115,8 +1112,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat'),
-                'name': l[0],
+                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
+                'name': l[0].encode('utf-8'),
              })
              url = u'http://www.youtube.com/api/timedtext?' + params
              sub_lang_list[lang] = url
@@ -1128,7 +1125,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def _get_available_automatic_caption(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
-        sub_format = self._downloader.params.get('subtitlesformat')
+        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
          self.to_screen(u'%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = u'Couldn\'t find automatic captions for %s' % video_id
@@ -1150,7 +1147,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              list_page = self._download_webpage(list_url, video_id)
              caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
              original_lang_node = caption_list.find('track')
-            if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
+            if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                  self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                  return {}
              original_lang = original_lang_node.attrib['lang_code']
@@ -1287,7 +1284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              # We simulate the access to the video from www.youtube.com/v/{video_id}
              # this can be viewed without login into Youtube
              data = compat_urllib_parse.urlencode({'video_id': video_id,
-                                                  'el': 'embedded',
+                                                  'el': 'player_embedded',
                                                    'gl': 'US',
                                                    'hl': 'en',
                                                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@@ -1316,6 +1313,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              else:
                  raise ExtractorError(u'"token" parameter not in video info for unknown reason')
  
+        if 'view_count' in video_info:
+            view_count = int(video_info['view_count'][0])
+        else:
+            view_count = None
+
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
              raise ExtractorError(u'"rental" videos not supported')
@@ -1403,32 +1405,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              # this signatures are encrypted
              if 'url_encoded_fmt_stream_map' not in args:
                  raise ValueError(u'No stream_map present')  # caught below
-            m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
+            re_signature = re.compile(r'[&,]s=')
+            m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
              if m_s is not None:
                  self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
                  video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
-            m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
+            m_s = re_signature.search(args.get('adaptive_fmts', u''))
              if m_s is not None:
-                if 'url_encoded_fmt_stream_map' in video_info:
-                    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
-                else:
-                    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
-            elif 'adaptive_fmts' in video_info:
-                if 'url_encoded_fmt_stream_map' in video_info:
-                    video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
+                if 'adaptive_fmts' in video_info:
+                    video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
                  else:
-                    video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
+                    video_info['adaptive_fmts'] = [args['adaptive_fmts']]
          except ValueError:
              pass
  
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
              video_url_list = [(None, video_info['conn'][0])]
-        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
-            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
+            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
+            if 'rtmpe%3Dyes' in encoded_url_map:
                  raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
              url_map = {}
-            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
+            for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
                  if 'itag' in url_data and 'url' in url_data:
                      url = url_data['url'][0]
@@ -1481,13 +1480,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
          results = []
-        for format_param, video_real_url in video_url_list:
+        for itag, video_real_url in video_url_list:
              # Extension
-            video_extension = self._video_extensions.get(format_param, 'flv')
+            video_extension = self._video_extensions.get(itag, 'flv')
  
-            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
-                                              self._video_dimensions.get(format_param, '???'),
-                                              ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
+            video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
+                                              self._video_dimensions.get(itag, '???'),
+                                              ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
  
              results.append({
                  'id':       video_id,
@@ -1498,17 +1497,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'title':    video_title,
                  'ext':      video_extension,
                  'format':   video_format,
+                'format_id': itag,
                  'thumbnail':    video_thumbnail,
                  'description':  video_description,
                  'player_url':   player_url,
                  'subtitles':    video_subtitles,
                  'duration':     video_duration,
                  'age_limit':    18 if age_gate else 0,
-                'annotations':  video_annotations
+                'annotations':  video_annotations,
+                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+                'view_count': view_count,
              })
          return results
  
-class YoutubePlaylistIE(InfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
      _VALID_URL = r"""(?:
                          (?:https?://)?
@@ -1524,8 +1526,9 @@ class YoutubePlaylistIE(InfoExtractor):
                       |
                          ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                       )"""
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
-    _MAX_RESULTS = 50
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+    _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&amp;'
      IE_NAME = u'youtube:playlist'
  
      @classmethod
@@ -1533,6 +1536,9 @@ class YoutubePlaylistIE(InfoExtractor):
          """Receives a URL and returns True if suitable for this IE."""
          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  
+    def _real_initialize(self):
+        self._login()
+
      def _real_extract(self, url):
          # Extract playlist id
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1546,51 +1552,33 @@ class YoutubePlaylistIE(InfoExtractor):
              video_id = query_dict['v'][0]
              if self._downloader.params.get('noplaylist'):
                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
              else:
                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
-        # Download playlist videos from API
-        videos = []
+        # Extract the video ids from the playlist pages
+        ids = []
  
          for page_num in itertools.count(1):
-            start_index = self._MAX_RESULTS * (page_num - 1) + 1
-            if start_index >= 1000:
-                self._downloader.report_warning(u'Max number of results reached')
-                break
-            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+            url = self._TEMPLATE_URL % (playlist_id, page_num)
              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+            # The ids are duplicated
+            new_ids = orderedSet(re.findall(self._VIDEO_RE, page))
+            ids.extend(new_ids)
  
-            try:
-                response = json.loads(page)
-            except ValueError as err:
-                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-
-            if 'feed' not in response:
-                raise ExtractorError(u'Got a malformed response from YouTube API')
-            playlist_title = response['feed']['title']['$t']
-            if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
+            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                  break
  
-            for entry in response['feed']['entry']:
-                index = entry['yt$position']['$t']
-                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
-                    videos.append((
-                        index,
-                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
-                    ))
-
-        videos = [v[1] for v in sorted(videos)]
+        playlist_title = self._og_search_title(page)
  
-        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
-        return [self.playlist_result(url_results, playlist_id, playlist_title)]
+        url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+                       for vid_id in ids]
+        return self.playlist_result(url_results, playlist_id, playlist_title)
  
  
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = u'YouTube.com channels'
      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
-    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
      _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
      IE_NAME = u'youtube:channel'
@@ -1611,36 +1599,37 @@ class YoutubeChannelIE(InfoExtractor):
          # Download channel page
          channel_id = mobj.group(1)
          video_ids = []
-        pagenum = 1
-
-        url = self._TEMPLATE_URL % (channel_id, pagenum)
-        page = self._download_webpage(url, channel_id,
-                                      u'Downloading page #%s' % pagenum)
-
-        # Extract video identifiers
-        ids_in_page = self.extract_videos_from_page(page)
-        video_ids.extend(ids_in_page)
+        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
+        channel_page = self._download_webpage(url, channel_id)
+        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
+            autogenerated = True
+        else:
+            autogenerated = False
  
-        # Download any subsequent channel pages using the json-based channel_ajax query
-        if self._MORE_PAGES_INDICATOR in page:
+        if autogenerated:
+            # The videos are contained in a single page
+            # the ajax pages can't be used, they are empty
+            video_ids = self.extract_videos_from_page(channel_page)
+        else:
+            # Download all channel pages using the json-based channel_ajax query
              for pagenum in itertools.count(1):
                  url = self._MORE_PAGES_URL % (pagenum, channel_id)
                  page = self._download_webpage(url, channel_id,
                                                u'Downloading page #%s' % pagenum)
-
+    
                  page = json.loads(page)
-
+    
                  ids_in_page = self.extract_videos_from_page(page['content_html'])
                  video_ids.extend(ids_in_page)
-
-                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
+    
+                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                      break
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
-        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
-        return [self.playlist_result(url_entries, channel_id)]
+        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                       for video_id in video_ids]
+        return self.playlist_result(url_entries, channel_id)
  
  
  class YoutubeUserIE(InfoExtractor):
@@ -1704,9 +1693,11 @@ class YoutubeUserIE(InfoExtractor):
              if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                  break
  
-        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
-        return [self.playlist_result(url_results, playlist_title = username)]
+        url_results = [
+            self.url_result(video_id, 'Youtube', video_id=video_id)
+            for video_id in video_ids]
+        return self.playlist_result(url_results, playlist_title=username)
+
  
  class YoutubeSearchIE(SearchInfoExtractor):
      IE_DESC = u'YouTube.com searches'
@@ -1747,9 +1738,14 @@ class YoutubeSearchIE(SearchInfoExtractor):
  
          if len(video_ids) > n:
              video_ids = video_ids[:n]
-        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                  for video_id in video_ids]
          return self.playlist_result(videos, query)
  
+class YoutubeSearchDateIE(YoutubeSearchIE):
+    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
+    _SEARCH_KEY = 'ytsearchdate'
+    IE_DESC = u'YouTube.com searches, newest videos first'
  
  class YoutubeShowIE(InfoExtractor):
      IE_DESC = u'YouTube.com (multi-season) shows'
@@ -1803,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
              feed_html = info['feed_html']
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            feed_entries.extend(
+                self.url_result(video_id, 'Youtube', video_id=video_id)
+                for video_id in ids)
              if info['paging'] is None:
                  break
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
@@ -1828,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
      _PAGING_STEP = 100
      _PERSONAL_FEED = True
  
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
+    _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PERSONAL_FEED = True
+    _PLAYLIST_TITLE = u'Youtube Watch History'
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History')
+        data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging')
+        # The step is actually a ridiculously big number (like 1374343569725646)
+        self._PAGING_STEP = int(data_paging)
+        return super(YoutubeHistoryIE, self)._real_extract(url)
+
  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
      IE_NAME = u'youtube:favorites'
      IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py

index faed7ff7f0511c666795a20f48eefd4dc96c7009..07f830e80793e5ac08432831e1179708461e22fc 100644 (file)
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,75 +1,111 @@
+import operator
  import re
  
  from .common import InfoExtractor
  from ..utils import (
-    determine_ext,
-    ExtractorError,
+    parse_xml_doc,
+    unified_strdate,
  )
  
  
  class ZDFIE(InfoExtractor):
      _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
-    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group('video_id')
  
-        if mobj.group('hash'):
-            url = url.replace(u'#', u'', 1)
+        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        info_xml = self._download_webpage(
+            xml_url, video_id, note=u'Downloading video info')
+        doc = parse_xml_doc(info_xml)
  
-        html = self._download_webpage(url, video_id)
-        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
-        if streams is None:
-            raise ExtractorError(u'No media url found.')
+        title = doc.find('.//information/title').text
+        description = doc.find('.//information/detail').text
+        uploader_node = doc.find('.//details/originChannelTitle')
+        uploader = None if uploader_node is None else uploader_node.text
+        duration_str = doc.find('.//details/length').text
+        duration_m = re.match(r'''(?x)^
+            (?P<hours>[0-9]{2})
+            :(?P<minutes>[0-9]{2})
+            :(?P<seconds>[0-9]{2})
+            (?:\.(?P<ms>[0-9]+)?)
+            ''', duration_str)
+        duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m
+            else None
+        )
+        upload_date = unified_strdate(doc.find('.//details/airtime').text)
  
-        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
-        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
-        # choose first/default media type and highest quality for now
-        def stream_pref(s):
-            TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+        def xml_to_format(fnode):
+            video_url = fnode.find('url').text
+            is_available = u'http://www.metafilegenerator' not in video_url
+
+            format_id = fnode.attrib['basetype']
+            format_m = re.match(r'''(?x)
+                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+            ''', format_id)
+
+            ext = format_m.group('container')
+            is_supported = ext != 'f4f'
+
+            PROTO_ORDER = ['http', 'rtmp', 'rtsp']
              try:
-                type_pref = TYPE_ORDER.index(s['media_type'])
+                proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
              except ValueError:
-                type_pref = 999
+                proto_pref = 999
  
-            QUALITY_ORDER = ['veryhigh', '300']
+            quality = fnode.find('./quality').text
+            QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
              try:
-                quality_pref = QUALITY_ORDER.index(s['quality'])
+                quality_pref = -QUALITY_ORDER.index(quality)
              except ValueError:
                  quality_pref = 999
  
-            return (type_pref, quality_pref)
-
-        sorted_streams = sorted(streams, key=stream_pref)
-        if not sorted_streams:
-            raise ExtractorError(u'No stream found.')
-        stream = sorted_streams[0]
-
-        media_link = self._download_webpage(
-            stream['video_url'],
-            video_id,
-            u'Get stream URL')
+            abr = int(fnode.find('./audioBitrate').text) // 1000
+            vbr = int(fnode.find('./videoBitrate').text) // 1000
+            pref = (is_available, is_supported,
+                    proto_pref, quality_pref, vbr, abr)
  
-        MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
-        RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+            format_note = u''
+            if not is_supported:
+                format_note += u'(unsupported)'
+            if not format_note:
+                format_note = None
  
-        mobj = re.search(self._MEDIA_STREAM, media_link)
-        if mobj is None:
-            mobj = re.search(RTSP_STREAM, media_link)
-            if mobj is None:
-                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
-        video_url = mobj.group('video_url')
+            return {
+                'format_id': format_id + u'-' + quality,
+                'url': video_url,
+                'ext': ext,
+                'acodec': format_m.group('acodec'),
+                'vcodec': format_m.group('vcodec'),
+                'abr': abr,
+                'vbr': vbr,
+                'width': int(fnode.find('./width').text),
+                'height': int(fnode.find('./height').text),
+                'filesize': int(fnode.find('./filesize').text),
+                'format_note': format_note,
+                '_pref': pref,
+                '_available': is_available,
+            }
  
-        title = self._html_search_regex(
-            r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
-            html, u'title')
+        format_nodes = doc.findall('.//formitaeten/formitaet')
+        formats = sorted(filter(lambda f: f['_available'],
+                                map(xml_to_format, format_nodes)),
+                         key=operator.itemgetter('_pref'))
  
          return {
              'id': video_id,
-            'url': video_url,
              'title': title,
-            'ext': determine_ext(video_url)
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+            'duration': duration,
+            'upload_date': upload_date,
          }
diff --git a/youtube_dl/update.py b/youtube_dl/update.py

index 0689a4891200bf2a03024b96ec2ecda5d857efb1..cd9670166e582ae9f3074c2371026f0f06c252a1 100644 (file)
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -2,11 +2,15 @@ import io
  import json
  import traceback
  import hashlib
+import os
  import subprocess
  import sys
  from zipimport import zipimporter
  
-from .utils import *
+from .utils import (
+    compat_str,
+    compat_urllib_request,
+)
  from .version import __version__
  
  def rsa_verify(message, signature, key):
@@ -37,6 +41,7 @@ def rsa_verify(message, signature, key):
      if signature != sha256(message).digest(): return False
      return True
  
+
  def update_self(to_screen, verbose):
      """Update the program file with the latest version from the repository"""
  
@@ -78,6 +83,13 @@ def update_self(to_screen, verbose):
          return
  
      version_id = versions_info['latest']
+
+    def version_tuple(version_str):
+        return tuple(map(int, version_str.split('.')))
+    if version_tuple(__version__) >= version_tuple(version_id):
+        to_screen(u'youtube-dl is up to date (%s)' % __version__)
+        return
+
      to_screen(u'Updating to version ' + version_id + '...')
      version = versions_info['versions'][version_id]
  
@@ -105,7 +117,7 @@ def update_self(to_screen, verbose):
              urlh = compat_urllib_request.urlopen(version['exe'][0])
              newcontent = urlh.read()
              urlh.close()
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to download latest version')
              return
@@ -118,7 +130,7 @@ def update_self(to_screen, verbose):
          try:
              with open(exe + '.new', 'wb') as outf:
                  outf.write(newcontent)
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to write the new version')
              return
@@ -137,7 +149,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
  
              subprocess.Popen([bat])  # Continues to run in the background
              return  # Do not show premature success messages
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to overwrite current version')
              return
@@ -148,7 +160,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
              urlh = compat_urllib_request.urlopen(version['bin'][0])
              newcontent = urlh.read()
              urlh.close()
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to download latest version')
              return
@@ -161,7 +173,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
          try:
              with open(filename, 'wb') as outf:
                  outf.write(newcontent)
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to overwrite current version')
              return
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index bfb8f6bcd971dad03d5236c8e607b59ff81c667a..946e90e93061bb7ff26b46595dfd7930974323b9 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -8,13 +8,16 @@ import gzip
  import io
  import json
  import locale
+import math
  import os
  import pipes
  import platform
  import re
+import ssl
  import socket
  import sys
  import traceback
+import xml.etree.ElementTree
  import zlib
  
  try:
@@ -535,17 +538,34 @@ def formatSeconds(secs):
      else:
          return '%d' % secs
  
-def make_HTTPS_handler(opts):
-    if sys.version_info < (3,2):
-        # Python's 2.x handler is very simplistic
-        return compat_urllib_request.HTTPSHandler()
+def make_HTTPS_handler(opts_no_check_certificate):
+    if sys.version_info < (3, 2):
+        import httplib
+
+        class HTTPSConnectionV3(httplib.HTTPSConnection):
+            def __init__(self, *args, **kwargs):
+                httplib.HTTPSConnection.__init__(self, *args, **kwargs)
+
+            def connect(self):
+                sock = socket.create_connection((self.host, self.port), self.timeout)
+                if self._tunnel_host:
+                    self.sock = sock
+                    self._tunnel()
+                try:
+                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
+                except ssl.SSLError:
+                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
+
+        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
+            def https_open(self, req):
+                return self.do_open(HTTPSConnectionV3, req)
+        return HTTPSHandlerV3()
      else:
-        import ssl
-        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
          context.set_default_verify_paths()
          
          context.verify_mode = (ssl.CERT_NONE
-                               if opts.no_check_certificate
+                               if opts_no_check_certificate
                                 else ssl.CERT_REQUIRED)
          return compat_urllib_request.HTTPSHandler(context=context)
  
@@ -572,6 +592,11 @@ class ExtractorError(Exception):
          return u''.join(traceback.format_tb(self.traceback))
  
  
+class RegexNotFoundError(ExtractorError):
+    """Error when a regex didn't match"""
+    pass
+
+
  class DownloadError(Exception):
      """Download Error exception.
  
@@ -729,6 +754,8 @@ def unified_strdate(date_str):
          '%Y/%m/%d %H:%M:%S',
          '%d.%m.%Y %H:%M',
          '%Y-%m-%dT%H:%M:%SZ',
+        '%Y-%m-%dT%H:%M:%S.%fZ',
+        '%Y-%m-%dT%H:%M:%S.%f0Z',
          '%Y-%m-%dT%H:%M:%S',
      ]
      for expression in format_expressions:
@@ -944,7 +971,16 @@ class locked_file(object):
  
  
  def shell_quote(args):
-    return ' '.join(map(pipes.quote, args))
+    quoted_args = []
+    encoding = sys.getfilesystemencoding()
+    if encoding is None:
+        encoding = 'utf-8'
+    for a in args:
+        if isinstance(a, bytes):
+            # We may get a filename encoded with 'encodeFilename'
+            a = a.decode(encoding)
+        quoted_args.append(pipes.quote(a))
+    return u' '.join(quoted_args)
  
  
  def takewhile_inclusive(pred, seq):
@@ -971,3 +1007,22 @@ def unsmuggle_url(smug_url):
      jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
      data = json.loads(jsond)
      return url, data
+
+
+def parse_xml_doc(s):
+    assert isinstance(s, type(u''))
+    return xml.etree.ElementTree.fromstring(s.encode('utf-8'))
+
+
+def format_bytes(bytes):
+    if bytes is None:
+        return u'N/A'
+    if type(bytes) is str:
+        bytes = float(bytes)
+    if bytes == 0.0:
+        exponent = 0
+    else:
+        exponent = int(math.log(bytes, 1024.0))
+    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
+    converted = float(bytes) / float(1024 ** exponent)
+    return u'%.2f%s' % (converted, suffix)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index e2cc6423dc79d17d5095f90ad2ecb1ef1187db35..2af23040fabf1594e0e597b6c28ec0cff1a446d6 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.10.18.2'
+__version__ = '2013.11.25.1'
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 25 Nov 2013 05:19:15 +0000 (06:19 +0100)
README.md		patch \| blob \| history
devscripts/check-porn.py	[new file with mode: 0644]	patch \| blob
setup.py		patch \| blob \| history
test/helper.py		patch \| blob \| history
test/test_YoutubeDL.py		patch \| blob \| history
test/test_age_restriction.py		patch \| blob \| history
test/test_all_urls.py		patch \| blob \| history
test/test_dailymotion_subtitles.py	[deleted file]	patch \| blob \| history
test/test_download.py		patch \| blob \| history
test/test_playlists.py		patch \| blob \| history
test/test_subtitles.py	[new file with mode: 0644]	patch \| blob
test/test_utils.py		patch \| blob \| history
test/test_write_annotations.py		patch \| blob \| history
test/test_write_info_json.py		patch \| blob \| history
test/test_youtube_lists.py		patch \| blob \| history
test/test_youtube_signature.py		patch \| blob \| history
test/test_youtube_subtitles.py	[deleted file]	patch \| blob \| history
youtube_dl/FileDownloader.py		patch \| blob \| history
youtube_dl/PostProcessor.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/addanime.py		patch \| blob \| history
youtube_dl/extractor/anitube.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/auengine.py		patch \| blob \| history
youtube_dl/extractor/bambuser.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/bandcamp.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/canalc2.py		patch \| blob \| history
youtube_dl/extractor/canalplus.py		patch \| blob \| history
youtube_dl/extractor/cinemassacre.py		patch \| blob \| history
youtube_dl/extractor/clipfish.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/cnn.py		patch \| blob \| history
youtube_dl/extractor/collegehumor.py		patch \| blob \| history
youtube_dl/extractor/comedycentral.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/d8.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/depositfiles.py		patch \| blob \| history
youtube_dl/extractor/eighttracks.py		patch \| blob \| history
youtube_dl/extractor/eitb.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/escapist.py		patch \| blob \| history
youtube_dl/extractor/exfm.py		patch \| blob \| history
youtube_dl/extractor/extremetube.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/facebook.py		patch \| blob \| history
youtube_dl/extractor/faz.py		patch \| blob \| history
youtube_dl/extractor/fktv.py		patch \| blob \| history
youtube_dl/extractor/gamekings.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/gamespot.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/googleplus.py		patch \| blob \| history
youtube_dl/extractor/howcast.py		patch \| blob \| history
youtube_dl/extractor/hypem.py		patch \| blob \| history
youtube_dl/extractor/instagram.py		patch \| blob \| history
youtube_dl/extractor/internetvideoarchive.py		patch \| blob \| history
youtube_dl/extractor/jeuxvideo.py		patch \| blob \| history
youtube_dl/extractor/kankan.py		patch \| blob \| history
youtube_dl/extractor/keezmovies.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/livestream.py		patch \| blob \| history
youtube_dl/extractor/metacafe.py		patch \| blob \| history
youtube_dl/extractor/mixcloud.py		patch \| blob \| history
youtube_dl/extractor/mofosex.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/mtv.py		patch \| blob \| history
youtube_dl/extractor/myspace.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/nhl.py		patch \| blob \| history
youtube_dl/extractor/niconico.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/nowvideo.py		patch \| blob \| history
youtube_dl/extractor/pornhub.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/pornotube.py		patch \| blob \| history
youtube_dl/extractor/redtube.py		patch \| blob \| history
youtube_dl/extractor/rtlnow.py		patch \| blob \| history
youtube_dl/extractor/slashdot.py		patch \| blob \| history
youtube_dl/extractor/soundcloud.py		patch \| blob \| history
youtube_dl/extractor/southparkstudios.py		patch \| blob \| history
youtube_dl/extractor/space.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/spankwire.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/spiegel.py		patch \| blob \| history
youtube_dl/extractor/streamcloud.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/subtitles.py		patch \| blob \| history
youtube_dl/extractor/sztvhu.py		patch \| blob \| history
youtube_dl/extractor/teamcoco.py		patch \| blob \| history
youtube_dl/extractor/ted.py		patch \| blob \| history
youtube_dl/extractor/toutv.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/tube8.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/tvp.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vevo.py		patch \| blob \| history
youtube_dl/extractor/viddler.py		patch \| blob \| history
youtube_dl/extractor/videodetective.py		patch \| blob \| history
youtube_dl/extractor/videopremium.py		patch \| blob \| history
youtube_dl/extractor/viki.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/vine.py		patch \| blob \| history
youtube_dl/extractor/vk.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/weibo.py		patch \| blob \| history
youtube_dl/extractor/xhamster.py		patch \| blob \| history
youtube_dl/extractor/xnxx.py		patch \| blob \| history
youtube_dl/extractor/xtube.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/xvideos.py		patch \| blob \| history
youtube_dl/extractor/yahoo.py		patch \| blob \| history
youtube_dl/extractor/youjizz.py		patch \| blob \| history
youtube_dl/extractor/youku.py		patch \| blob \| history
youtube_dl/extractor/youporn.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/extractor/zdf.py		patch \| blob \| history
youtube_dl/update.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history