Merge remote-tracking branch 'riking/twofactor'

author Philipp Hagemeister <phihag@phihag.de>

Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)

committer Philipp Hagemeister <phihag@phihag.de>

Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)
author Philipp Hagemeister <phihag@phihag.de>
Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)
diff --git a/Makefile b/Makefile

index c079761efa9b2e60887575f4cd7626d0abe469a2..088a9320bddfd367babd928bc96c71f3eaa4d9de 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -6,10 +6,10 @@ clean:
  cleanall: clean
         rm -f youtube-dl youtube-dl.exe
  
-PREFIX=/usr/local
-BINDIR=$(PREFIX)/bin
-MANDIR=$(PREFIX)/man
-PYTHON=/usr/bin/env python
+PREFIX ?= /usr/local
+BINDIR ?= $(PREFIX)/bin
+MANDIR ?= $(PREFIX)/man
+PYTHON ?= /usr/bin/env python
  
  # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
  ifeq ($(PREFIX),/usr)
diff --git a/README.md b/README.md

index a42dfb8567ffb86e926756aec59cbee94cee29bf..72449cad71a09f855088a9df8b3ac1df9e5bcc18 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,6 +17,14 @@ If you do not have curl, you can alternatively use a recent wget:
  
  Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
  
+OS X users can install **youtube-dl** with [Homebrew](http://brew.sh/).
+
+    brew install youtube-dl
+
+You can also use pip:
+
+    sudo pip install youtube-dl
+
  Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html .
  
  # DESCRIPTION
@@ -303,10 +311,12 @@ The current default template is `%(title)s-%(id)s.%(ext)s`.
  
  In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
  
-    $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc
-    youtube-dl test video ''_ä↭𝕐.mp4    # All kinds of weird characters
-    $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filenames
-    youtube-dl_test_video_.mp4          # A simple file name
+```bash
+$ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc
+youtube-dl test video ''_ä↭𝕐.mp4    # All kinds of weird characters
+$ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filenames
+youtube-dl_test_video_.mp4          # A simple file name
+```
  
  # VIDEO SELECTION
  
@@ -317,14 +327,16 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb
   
  Examples:
  
-    # Download only the videos uploaded in the last 6 months
-    $ youtube-dl --dateafter now-6months
+```bash
+# Download only the videos uploaded in the last 6 months
+$ youtube-dl --dateafter now-6months
  
-    # Download only the videos uploaded on January 1, 1970
-    $ youtube-dl --date 19700101
+# Download only the videos uploaded on January 1, 1970
+$ youtube-dl --date 19700101
  
-    $ # will only download the videos uploaded in the 200x decade
-    $ youtube-dl --dateafter 20000101 --datebefore 20091231
+$ # will only download the videos uploaded in the 200x decade
+$ youtube-dl --dateafter 20000101 --datebefore 20091231
+```
  
  # FAQ
  
@@ -399,49 +411,49 @@ If you want to add support for a new site, you can follow this quick list (assum
  2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
  3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
  4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
-
-        # coding: utf-8
-        from __future__ import unicode_literals
-
-        import re
-
-        from .common import InfoExtractor
-        
-        
-        class YourExtractorIE(InfoExtractor):
-            _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
-            _TEST = {
-                'url': 'http://yourextractor.com/watch/42',
-                'md5': 'TODO: md5 sum of the first 10KiB of the video file',
-                'info_dict': {
-                    'id': '42',
-                    'ext': 'mp4',
-                    'title': 'Video title goes here',
-                    # TODO more properties, either as:
-                    # * A value
-                    # * MD5 checksum; start the string with md5:
-                    # * A regular expression; start the string with re:
-                    # * Any Python type (for example int or float)
-                }
+    ```python
+    # coding: utf-8
+    from __future__ import unicode_literals
+
+    import re
+
+    from .common import InfoExtractor
+
+
+    class YourExtractorIE(InfoExtractor):
+        _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+        _TEST = {
+            'url': 'http://yourextractor.com/watch/42',
+            'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+            'info_dict': {
+                'id': '42',
+                'ext': 'mp4',
+                'title': 'Video title goes here',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                # TODO more properties, either as:
+                # * A value
+                # * MD5 checksum; start the string with md5:
+                # * A regular expression; start the string with re:
+                # * Any Python type (for example int or float)
              }
+        }
  
-            def _real_extract(self, url):
-                mobj = re.match(self._VALID_URL, url)
-                video_id = mobj.group('id')
-
-                # TODO more code goes here, for example ...
-                webpage = self._download_webpage(url, video_id)
-                title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
-
-                return {
-                    'id': video_id,
-                    'title': title,
-                    # TODO more properties (see youtube_dl/extractor/common.py)
-                }
+        def _real_extract(self, url):
+            mobj = re.match(self._VALID_URL, url)
+            video_id = mobj.group('id')
  
+            # TODO more code goes here, for example ...
+            webpage = self._download_webpage(url, video_id)
+            title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
  
+            return {
+                'id': video_id,
+                'title': title,
+                # TODO more properties (see youtube_dl/extractor/common.py)
+            }
+    ```
  5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
  7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
  8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
  9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
diff --git a/test/helper.py b/test/helper.py

index b7299fb82c2e541fc520ba11c5c52d9edcc972e3..22d7638606841bc0250665402e09fb49655e5d89 100644 (file)
--- a/test/helper.py
+++ b/test/helper.py
@@ -117,8 +117,9 @@ def expect_info_dict(self, expected_dict, got_dict):
                  u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
  
      # Check for the presence of mandatory fields
-    for key in ('id', 'url', 'title', 'ext'):
-        self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
+    if got_dict.get('_type') != 'playlist':
+        for key in ('id', 'url', 'title', 'ext'):
+            self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
      # Check for mandatory fields that are automatically set by YoutubeDL
      for key in ['webpage_url', 'extractor', 'extractor_key']:
          self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py

index e794cc97f0e643c5f05539fd3d0313d30dc98f8d..ab61e19768e4454f061ab9f832cc70c968440d85 100644 (file)
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -221,7 +221,7 @@ class TestFormatSelection(unittest.TestCase):
              '138', '137', '248', '136', '247', '135', '246',
              '245', '244', '134', '243', '133', '242', '160',
              # Dash audio
-            '141', '172', '140', '139', '171',
+            '141', '172', '140', '171', '139',
          ]
  
          for f1id, f2id in zip(order, order[1:]):
diff --git a/test/test_download.py b/test/test_download.py

index d6540588c130f6bafacd4ef7d077e6debf8d911d..c8d4ec2c87c97773d60c52ffb342809b8e2a0ffb 100644 (file)
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -63,15 +63,21 @@ def generator(test_case):
      def test_template(self):
          ie = youtube_dl.extractor.get_info_extractor(test_case['name'])
          other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])]
+        is_playlist = any(k.startswith('playlist') for k in test_case)
+        test_cases = test_case.get(
+            'playlist', [] if is_playlist else [test_case])
+
          def print_skipping(reason):
              print('Skipping %s: %s' % (test_case['name'], reason))
          if not ie.working():
              print_skipping('IE marked as not _WORKING')
              return
-        if 'playlist' not in test_case:
-            info_dict = test_case.get('info_dict', {})
-            if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
+
+        for tc in test_cases:
+            info_dict = tc.get('info_dict', {})
+            if not tc.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
                  raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
+
          if 'skip' in test_case:
              print_skipping(test_case['skip'])
              return
@@ -81,6 +87,9 @@ def generator(test_case):
                  return
  
          params = get_params(test_case.get('params', {}))
+        if is_playlist and 'playlist' not in test_case:
+            params.setdefault('extract_flat', True)
+            params.setdefault('skip_download', True)
  
          ydl = YoutubeDL(params)
          ydl.add_default_info_extractors()
@@ -93,7 +102,6 @@ def generator(test_case):
          def get_tc_filename(tc):
              return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {}))
  
-        test_cases = test_case.get('playlist', [test_case])
          def try_rm_tcs_files():
              for tc in test_cases:
                  tc_filename = get_tc_filename(tc)
@@ -105,7 +113,10 @@ def generator(test_case):
              try_num = 1
              while True:
                  try:
-                    ydl.download([test_case['url']])
+                    # We're not using .download here since that is just a shim
+                    # for outside error handling, and returns the exit code
+                    # instead of the result dict.
+                    res_dict = ydl.extract_info(test_case['url'])
                  except (DownloadError, ExtractorError) as err:
                      # Check if the exception is not a network related one
                      if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
@@ -121,6 +132,17 @@ def generator(test_case):
                  else:
                      break
  
+            if is_playlist:
+                self.assertEqual(res_dict['_type'], 'playlist')
+                expect_info_dict(self, test_case.get('info_dict', {}), res_dict)
+            if 'playlist_mincount' in test_case:
+                self.assertGreaterEqual(
+                    len(res_dict['entries']),
+                    test_case['playlist_mincount'],
+                    'Expected at least %d in playlist %s, but got only %d' % (
+                        test_case['playlist_mincount'], test_case['url'],
+                        len(res_dict['entries'])))
+
              for tc in test_cases:
                  tc_filename = get_tc_filename(tc)
                  if not test_case.get('params', {}).get('skip_download', False):
diff --git a/test/test_playlists.py b/test/test_playlists.py

index 4f188345bf2b9bd7fee5d886cbcfec3ba15dae6e..3f79a7d6a5f7d33ce64750c0f4c3d752547c59dc 100644 (file)
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -1,6 +1,17 @@
  #!/usr/bin/env python
  # encoding: utf-8
  
+## DEPRECATED FILE!
+# Add new tests to the extractors themselves, like this:
+# _TEST = {
+#    'url': 'http://example.com/playlist/42',
+#    'playlist_mincount': 99,
+#    'info_dict': {
+#        'id': '42',
+#        'title': 'Playlist number forty-two',
+#    }
+# }
+
  from __future__ import unicode_literals
  
  # Allow direct execution
@@ -51,6 +62,7 @@ from youtube_dl.extractor import (
      InstagramUserIE,
      CSpanIE,
      AolIE,
+    GameOnePlaylistIE,
  )
  
  
@@ -396,5 +408,6 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['id'], 'rbhagwati2')
          assertGreaterEqual(self, len(result['entries']), 179)
  
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 14a1d06ab1ed3350547822cac71501745a14842a..a671d6450469167381f1649f54108d1ea6ab5533 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -162,6 +162,7 @@ class YoutubeDL(object):
      default_search:    Prepend this string if an input url is not valid.
                         'auto' for elaborate guessing
      encoding:          Use this encoding instead of the system-specified.
+    extract_flat:      Do not resolve URLs, return the immediate result.
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
@@ -479,7 +480,10 @@ class YoutubeDL(object):
                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
          age_limit = self.params.get('age_limit')
          if age_limit is not None:
-            if age_limit < info_dict.get('age_limit', 0):
+            actual_age_limit = info_dict.get('age_limit')
+            if actual_age_limit is None:
+                actual_age_limit = 0
+            if age_limit < actual_age_limit:
                  return 'Skipping "' + title + '" because it is age restricted'
          if self.in_download_archive(info_dict):
              return '%s has already been recorded in archive' % video_title
@@ -558,7 +562,12 @@ class YoutubeDL(object):
          Returns the resolved ie_result.
          """
  
-        result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
+        result_type = ie_result.get('_type', 'video')
+
+        if self.params.get('extract_flat', False):
+            if result_type in ('url', 'url_transparent'):
+                return ie_result
+
          if result_type == 'video':
              self.add_extra_info(ie_result, extra_info)
              return self.process_video_result(ie_result, download=download)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index 80de211e75ea266d08bdc4c1bc8236bdd0741c24..a96bf9b5cd978cdc09e8cec4596cad473adc809a 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -69,6 +69,10 @@ __authors__  = (
      'Dobrosław Żybort',
      'David Fabijan',
      'Sebastian Haas',
+    'Alexander Kirk',
+    'Erik Johnson',
+    'Keith Beckman',
+    'Ole Ernst',
  )
  
  __license__ = 'Public Domain'
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py

index f79e6a99587cdc7d13ba210e528512a1e572e4ea..d01d1897e411fc5005c15868b46cfdf174c2ca4c 100644 (file)
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -27,8 +27,16 @@ class HttpFD(FileDownloader):
              headers['Youtubedl-user-agent'] = info_dict['user_agent']
          if 'http_referer' in info_dict:
              headers['Referer'] = info_dict['http_referer']
-        basic_request = compat_urllib_request.Request(url, None, headers)
-        request = compat_urllib_request.Request(url, None, headers)
+        add_headers = info_dict.get('http_headers')
+        if add_headers:
+            headers.update(add_headers)
+        data = info_dict.get('http_post_data')
+        http_method = info_dict.get('http_method')
+        basic_request = compat_urllib_request.Request(url, data, headers)
+        request = compat_urllib_request.Request(url, data, headers)
+        if http_method is not None:
+            basic_request.get_method = lambda: http_method
+            request.get_method = lambda: http_method
  
          is_test = self.params.get('test', False)
  
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 27602e0c03496cdf88784d547d53441a471fa31f..de6e8ee30f5ce1338fe63840347c80c14b465ff1 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -69,6 +69,7 @@ from .dfb import DFBIE
  from .dotsub import DotsubIE
  from .dreisat import DreiSatIE
  from .drtv import DRTVIE
+from .dump import DumpIE
  from .defense import DefenseGouvFrIE
  from .discovery import DiscoveryIE
  from .divxstage import DivxStageIE
@@ -77,12 +78,17 @@ from .ebaumsworld import EbaumsWorldIE
  from .ehow import EHowIE
  from .eighttracks import EightTracksIE
  from .eitb import EitbIE
+from .ellentv import (
+    EllenTVIE,
+    EllenTVClipsIE,
+)
  from .elpais import ElPaisIE
  from .empflix import EmpflixIE
  from .engadget import EngadgetIE
  from .escapist import EscapistIE
  from .everyonesmixtape import EveryonesMixtapeIE
  from .exfm import ExfmIE
+from .expotv import ExpoTVIE
  from .extremetube import ExtremeTubeIE
  from .facebook import FacebookIE
  from .faz import FazIE
@@ -110,7 +116,10 @@ from .freesound import FreesoundIE
  from .freespeech import FreespeechIE
  from .funnyordie import FunnyOrDieIE
  from .gamekings import GamekingsIE
-from .gameone import GameOneIE
+from .gameone import (
+    GameOneIE,
+    GameOnePlaylistIE,
+)
  from .gamespot import GameSpotIE
  from .gamestar import GameStarIE
  from .gametrailers import GametrailersIE
@@ -121,6 +130,7 @@ from .googleplus import GooglePlusIE
  from .googlesearch import GoogleSearchIE
  from .gorillavid import GorillaVidIE
  from .goshgay import GoshgayIE
+from .grooveshark import GroovesharkIE
  from .hark import HarkIE
  from .helsinki import HelsinkiIE
  from .hentaistigma import HentaiStigmaIE
@@ -147,6 +157,7 @@ from .ivi import (
  from .izlesene import IzleseneIE
  from .jadorecettepub import JadoreCettePubIE
  from .jeuxvideo import JeuxVideoIE
+from .jove import JoveIE
  from .jukebox import JukeboxIE
  from .justintv import JustinTVIE
  from .jpopsukitv import JpopsukiIE
@@ -177,7 +188,9 @@ from .malemotion import MalemotionIE
  from .mdr import MDRIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
+from .ministrygrid import MinistryGridIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
+from .mitele import MiTeleIE
  from .mixcloud import MixcloudIE
  from .mlb import MLBIE
  from .mpora import MporaIE
@@ -187,6 +200,7 @@ from .mooshare import MooshareIE
  from .morningstar import MorningstarIE
  from .motherless import MotherlessIE
  from .motorsport import MotorsportIE
+from .movieclips import MovieClipsIE
  from .moviezine import MoviezineIE
  from .movshare import MovShareIE
  from .mtv import (
@@ -233,8 +247,10 @@ from .orf import (
      ORFFM4IE,
  )
  from .parliamentliveuk import ParliamentLiveUKIE
+from .patreon import PatreonIE
  from .pbs import PBSIE
  from .photobucket import PhotobucketIE
+from .playfm import PlayFMIE
  from .playvid import PlayvidIE
  from .podomatic import PodomaticIE
  from .pornhd import PornHdIE
@@ -252,9 +268,10 @@ from .ro220 import Ro220IE
  from .rottentomatoes import RottenTomatoesIE
  from .roxwel import RoxwelIE
  from .rtbf import RTBFIE
+from .rtlnl import RtlXlIE
  from .rtlnow import RTLnowIE
  from .rts import RTSIE
-from .rtve import RTVEALaCartaIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE
  from .ruhd import RUHDIE
  from .rutube import (
      RutubeIE,
@@ -265,6 +282,7 @@ from .rutube import (
  from .rutv import RUTVIE
  from .sapo import SapoIE
  from .savefrom import SaveFromIE
+from .sbs import SBSIE
  from .scivee import SciVeeIE
  from .screencast import ScreencastIE
  from .servingsys import ServingSysIE
@@ -377,6 +395,7 @@ from .vuclip import VuClipIE
  from .vulture import VultureIE
  from .washingtonpost import WashingtonPostIE
  from .wat import WatIE
+from .wayofthemaster import WayOfTheMasterIE
  from .wdr import (
      WDRIE,
      WDRMobileIE,
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py

index 7e93bc4df286e71554f272fb69ba43348f3c5da4..74860882628017c5ab7a44f22bd9b05286ad556e 100644 (file)
--- a/youtube_dl/extractor/aparat.py
+++ b/youtube_dl/extractor/aparat.py
@@ -1,5 +1,7 @@
  #coding: utf-8
  
+from __future__ import unicode_literals
+
  import re
  
  from .common import InfoExtractor
@@ -13,13 +15,14 @@ class AparatIE(InfoExtractor):
      _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
  
      _TEST = {
-        u'url': u'http://www.aparat.com/v/wP8On',
-        u'file': u'wP8On.mp4',
-        u'md5': u'6714e0af7e0d875c5a39c4dc4ab46ad1',
-        u'info_dict': {
-            u"title": u"تیم گلکسی 11 - زومیت",
+        'url': 'http://www.aparat.com/v/wP8On',
+        'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1',
+        'info_dict': {
+            'id': 'wP8On',
+            'ext': 'mp4',
+            'title': 'تیم گلکسی 11 - زومیت',
          },
-        #u'skip': u'Extremely unreliable',
+        # 'skip': 'Extremely unreliable',
      }
  
      def _real_extract(self, url):
@@ -29,8 +32,8 @@ class AparatIE(InfoExtractor):
          # Note: There is an easier-to-parse configuration at
          # http://www.aparat.com/video/video/config/videohash/%video_id
          # but the URL in there does not work
-        embed_url = (u'http://www.aparat.com/video/video/embed/videohash/' +
-                     video_id + u'/vt/frame')
+        embed_url = ('http://www.aparat.com/video/video/embed/videohash/' +
+                     video_id + '/vt/frame')
          webpage = self._download_webpage(embed_url, video_id)
  
          video_urls = re.findall(r'fileList\[[0-9]+\]\s*=\s*"([^"]+)"', webpage)
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index d86dbba8e8db9748ecf61b59a36bfffa532a33d1..1c72b2ff6f5b4d5a6c01a4a0dfde2b2de6517e04 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -177,16 +177,26 @@ class ArteTVPlus7IE(InfoExtractor):
  # It also uses the arte_vp_url url from the webpage to extract the information
  class ArteTVCreativeIE(ArteTVPlus7IE):
      IE_NAME = 'arte.tv:creative'
-    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'
+    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/(?:magazine?/)?(?P<id>[^?#]+)'
  
-    _TEST = {
+    _TESTS = [{
          'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
          'info_dict': {
-            'id': '050489-002',
+            'id': '72176',
              'ext': 'mp4',
-            'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
+            'title': 'Folge 2 - Corporate Design',
+            'upload_date': '20131004',
          },
-    }
+    }, {
+        'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion',
+        'info_dict': {
+            'id': '160676',
+            'ext': 'mp4',
+            'title': 'Monty Python live (mostly)',
+            'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n',
+            'upload_date': '20140805',
+        }
+    }]
  
  
  class ArteTVFutureIE(ArteTVPlus7IE):
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py

index acfc4ad736d9deecf1ed9cdadd4063e2fc8e7243..261ead98f53f1747943d22935e04ab2d3a908d11 100644 (file)
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -15,7 +15,7 @@ from ..utils import (
  
  
  class BlipTVIE(SubtitlesInfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+]+)))'
+    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_TESTS]+)))'
  
      _TESTS = [
          {
@@ -49,6 +49,21 @@ class BlipTVIE(SubtitlesInfoExtractor):
                  'uploader_id': '792887',
                  'duration': 279,
              }
+        },
+        {
+            # https://bugzilla.redhat.com/show_bug.cgi?id=967465
+            'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI',
+            'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6',
+            'info_dict': {
+                'id': '6573122',
+                'ext': 'mov',
+                'upload_date': '20130520',
+                'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.',
+                'title': 'Red vs. Blue Season 11 Trailer',
+                'timestamp': 1369029609,
+                'uploader': 'redvsblue',
+                'uploader_id': '792887',
+            }
          }
      ]
  
@@ -150,7 +165,7 @@ class BlipTVIE(SubtitlesInfoExtractor):
  
  
  class BlipTVUserIE(InfoExtractor):
-    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
+    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'
      _PAGE_SIZE = 12
      IE_NAME = 'blip.tv:user'
  
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index 419951b6279ae87fb8f0dab1c4f5249ce221a268..294670386256dc45a071544345e259cbf545e7c7 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -154,12 +154,14 @@ class BrightcoveIE(InfoExtractor):
      def _extract_brightcove_urls(cls, webpage):
          """Return a list of all Brightcove URLs from the webpage """
  
-        url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
+        url_m = re.search(
+            r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+            webpage)
          if url_m:
              url = unescapeHTML(url_m.group(1))
              # Some sites don't add it, we can't download with this url, for example:
              # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
-            if 'playerKey' in url:
+            if 'playerKey' in url or 'videoId' in url:
                  return [url]
  
          matches = re.findall(
@@ -188,9 +190,13 @@ class BrightcoveIE(InfoExtractor):
              referer = smuggled_data.get('Referer', url)
              return self._get_video_info(
                  videoPlayer[0], query_str, query, referer=referer)
-        else:
+        elif 'playerKey' in query:
              player_key = query['playerKey']
              return self._get_playlist_info(player_key[0])
+        else:
+            raise ExtractorError(
+                'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
+                expected=True)
  
      def _get_video_info(self, video_id, query_str, query, referer=None):
          request_url = self._FEDERATED_URL_TEMPLATE % query_str
@@ -202,6 +208,13 @@ class BrightcoveIE(InfoExtractor):
              req.add_header('Referer', referer)
          webpage = self._download_webpage(req, video_id)
  
+        error_msg = self._html_search_regex(
+            r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage,
+            'error message', default=None)
+        if error_msg is not None:
+            raise ExtractorError(
+                'brightcove said: %s' % error_msg, expected=True)
+
          self.report_extraction(video_id)
          info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json')
          info = json.loads(info)['data']
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 45a17f8ada1fb0b0977ff9492876841965077c29..4d5b48167cb604b6679b6e524b5420efb1b3b9c5 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -84,6 +84,12 @@ class InfoExtractor(object):
                                   format, irrespective of the file format.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
+                    * http_referer  HTTP Referer header value to set.
+                    * http_method  HTTP method to use for the download.
+                    * http_headers  A dictionary of additional HTTP headers
+                                 to add to the request.
+                    * http_post_data  Additional data to send with a POST
+                                 request.
      url:            Final video URL.
      ext:            Video filename extension.
      format:         The video format, defaults to ext (used for --get-format)
@@ -479,8 +485,9 @@ class InfoExtractor(object):
          return self._og_search_property('title', html, **kargs)
  
      def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-        regexes = self._og_regexes('video')
-        if secure: regexes = self._og_regexes('video:secure_url') + regexes
+        regexes = self._og_regexes('video') + self._og_regexes('video:url')
+        if secure:
+            regexes = self._og_regexes('video:secure_url') + regexes
          return self._html_search_regex(regexes, html, name, **kargs)
  
      def _og_search_url(self, html, **kargs):
diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py

new file mode 100644 (file)

index 0000000..6b65177
--- /dev/null
+++ b/youtube_dl/extractor/dump.py
@@ -0,0 +1,39 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class DumpIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/'
+
+    _TEST = {
+        'url': 'http://www.dump.com/oneus/',
+        'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99',
+        'info_dict': {
+            'id': 'oneus',
+            'ext': 'flv',
+            'title': "He's one of us.",
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_url = self._search_regex(
+            r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL')
+
+        thumb = self._og_search_thumbnail(webpage)
+        title = self._search_regex(r'<b>([^"]+)</b>', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumb,
+        }
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py

index 877113d63a7261a284a628a06908c466d446613d..63c2549d37aa528cc79f83822c7a267d391b74cc 100644 (file)
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -1,19 +1,21 @@
+from __future__ import unicode_literals
+
  import re
  
  from .common import InfoExtractor
-from ..utils import determine_ext
  
  
  class EbaumsWorldIE(InfoExtractor):
      _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)'
  
      _TEST = {
-        u'url': u'http://www.ebaumsworld.com/video/watch/83367677/',
-        u'file': u'83367677.mp4',
-        u'info_dict': {
-            u'title': u'A Giant Python Opens The Door',
-            u'description': u'This is how nightmares start...',
-            u'uploader': u'jihadpizza',
+        'url': 'http://www.ebaumsworld.com/video/watch/83367677/',
+        'info_dict': {
+            'id': '83367677',
+            'ext': 'mp4',
+            'title': 'A Giant Python Opens The Door',
+            'description': 'This is how nightmares start...',
+            'uploader': 'jihadpizza',
          },
      }
  
@@ -28,7 +30,6 @@ class EbaumsWorldIE(InfoExtractor):
              'id': video_id,
              'title': config.find('title').text,
              'url': video_url,
-            'ext': determine_ext(video_url),
              'description': config.find('description').text,
              'thumbnail': config.find('image').text,
              'uploader': config.find('username').text,
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py

new file mode 100644 (file)

index 0000000..3e79236
--- /dev/null
+++ b/youtube_dl/extractor/ellentv.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
+
+
+class EllenTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)'
+    _TEST = {
+        'url': 'http://www.ellentv.com/videos/0-7jqrsr18/',
+        'md5': 'e4af06f3bf0d5f471921a18db5764642',
+        'info_dict': {
+            'id': '0-7jqrsr18',
+            'ext': 'mp4',
+            'title': 'What\'s Wrong with These Photos? A Whole Lot',
+            'timestamp': 1406876400,
+            'upload_date': '20140801',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<span class="publish-date"><time datetime="([^"]+)">',
+            webpage, 'timestamp'))
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'url': self._html_search_meta('VideoURL', webpage, 'url'),
+            'timestamp': timestamp,
+        }
+
+
+class EllenTVClipsIE(InfoExtractor):
+    IE_NAME = 'EllenTV:clips'
+    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)'
+    _TEST = {
+        'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/',
+        'info_dict': {
+            'id': 'meryl-streep-vanessa-hudgens',
+            'title': 'Meryl Streep, Vanessa Hudgens',
+        },
+        'playlist_mincount': 9,
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, playlist_id)
+        playlist = self._extract_playlist(webpage)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': self._og_search_title(webpage),
+            'entries': self._extract_entries(playlist)
+        }
+
+    def _extract_playlist(self, webpage):
+        json_string = self._search_regex(r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json')
+        try:
+            return json.loads("[{" + json_string + "}]")
+        except ValueError as ve:
+            raise ExtractorError('Failed to download JSON', cause=ve)
+
+    def _extract_entries(self, playlist):
+        return [self.url_result(item['url'], 'EllenTV') for item in playlist]
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py

index 272dfe1f643208a31635dade0e561c8eb009aab7..476fc22b93424b13255d5eec3578eb985dbfbdfd 100644 (file)
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -36,7 +36,7 @@ class EscapistIE(InfoExtractor):
              r'<meta name="description" content="([^"]*)"',
              webpage, 'description', fatal=False)
  
-        playerUrl = self._og_search_video_url(webpage, name=u'player URL')
+        playerUrl = self._og_search_video_url(webpage, name='player URL')
  
          title = self._html_search_regex(
              r'<meta name="title" content="([^"]*)"',
diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py

new file mode 100644 (file)

index 0000000..a38b773
--- /dev/null
+++ b/youtube_dl/extractor/expotv.py
@@ -0,0 +1,73 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
+
+
+class ExpoTVIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])'
+    _TEST = {
+        'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561',
+        'md5': '2985e6d7a392b2f7a05e0ca350fe41d0',
+        'info_dict': {
+            'id': '17561',
+            'ext': 'mp4',
+            'upload_date': '20060212',
+            'title': 'My Favorite Online Scrapbook Store',
+            'view_count': int,
+            'description': 'You\'ll find most everything you need at this virtual store front.',
+            'uploader': 'Anna T.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        player_key = self._search_regex(
+            r'<param name="playerKey" value="([^"]+)"', webpage, 'player key')
+        config_url = 'http://client.expotv.com/video/config/%s/%s' % (
+            video_id, player_key)
+        config = self._download_json(
+            config_url, video_id,
+            note='Downloading video configuration')
+
+        formats = [{
+            'url': fcfg['file'],
+            'height': int_or_none(fcfg.get('height')),
+            'format_note': fcfg.get('label'),
+            'ext': self._search_regex(
+                r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'],
+                'file extension', default=None),
+        } for fcfg in config['sources']]
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = config.get('image')
+        view_count = int_or_none(self._search_regex(
+            r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts'))
+        uploader = self._search_regex(
+            r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader',
+            fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date',
+            fatal=False))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'view_count': view_count,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py

index b580f52fb42fd80f42dd78201b7a6db09425f77c..3022f539d2571f34ae87bc91dc4cf1f1e25ccdc0 100644 (file)
--- a/youtube_dl/extractor/gameone.py
+++ b/youtube_dl/extractor/gameone.py
@@ -88,3 +88,28 @@ class GameOneIE(InfoExtractor):
              'age_limit': age_limit,
              'timestamp': timestamp,
          }
+
+
+class GameOnePlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$'
+    IE_NAME = 'gameone:playlist'
+    _TEST = {
+        'url': 'http://www.gameone.de/tv',
+        'info_dict': {
+            'title': 'GameOne',
+        },
+        'playlist_mincount': 294,
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage('http://www.gameone.de/tv', 'TV')
+        max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage)))
+        entries = [
+            self.url_result('http://www.gameone.de/tv/%d' % video_id, 'GameOne')
+            for video_id in range(max_id, 0, -1)]
+
+        return {
+            '_type': 'playlist',
+            'title': 'GameOne',
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 8e915735eaedcacaff85b4408fb93deb79bac16b..8b11f7f7a086cc28828a8e0afe7ec520d13ed956 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -15,11 +15,14 @@ from ..utils import (
      compat_xml_parse_error,
  
      ExtractorError,
+    float_or_none,
      HEADRequest,
+    orderedSet,
      parse_xml,
      smuggle_url,
      unescapeHTML,
      unified_strdate,
+    unsmuggle_url,
      url_basename,
  )
  from .brightcove import BrightcoveIE
@@ -289,6 +292,58 @@ class GenericIE(InfoExtractor):
                  'description': 'Mario\'s life in the fast lane has never looked so good.',
              },
          },
+        # YouTube embed via <data-embed-url="">
+        {
+            'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
+            'info_dict': {
+                'id': 'jpSGZsgga_I',
+                'ext': 'mp4',
+                'title': 'Asphalt 8: Airborne - Launch Trailer',
+                'uploader': 'Gameloft',
+                'uploader_id': 'gameloft',
+                'upload_date': '20130821',
+                'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a',
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+        # Camtasia studio
+        {
+            'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
+            'playlist': [{
+                'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
+                'info_dict': {
+                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
+                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
+                    'ext': 'flv',
+                    'duration': 2235.90,
+                }
+            }, {
+                'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
+                'info_dict': {
+                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
+                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
+                    'ext': 'flv',
+                    'duration': 2235.93,
+                }
+            }],
+            'info_dict': {
+                'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
+            }
+        },
+        # Flowplayer
+        {
+            'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
+            'md5': '9d65602bf31c6e20014319c7d07fba27',
+            'info_dict': {
+                'id': '5123ea6d5e5a7',
+                'ext': 'mp4',
+                'age_limit': 18,
+                'uploader': 'www.handjobhub.com',
+                'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
+            }
+        }
      ]
  
      def report_download_webpage(self, video_id):
@@ -301,58 +356,6 @@ class GenericIE(InfoExtractor):
          """Report information extraction."""
          self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
  
-    def _send_head(self, url):
-        """Check if it is a redirect, like url shorteners, in case return the new url."""
-
-        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
-            """
-            Subclass the HTTPRedirectHandler to make it use our
-            HEADRequest also on the redirected URL
-            """
-            def redirect_request(self, req, fp, code, msg, headers, newurl):
-                if code in (301, 302, 303, 307):
-                    newurl = newurl.replace(' ', '%20')
-                    newheaders = dict((k,v) for k,v in req.headers.items()
-                                      if k.lower() not in ("content-length", "content-type"))
-                    try:
-                        # This function was deprecated in python 3.3 and removed in 3.4
-                        origin_req_host = req.get_origin_req_host()
-                    except AttributeError:
-                        origin_req_host = req.origin_req_host
-                    return HEADRequest(newurl,
-                                       headers=newheaders,
-                                       origin_req_host=origin_req_host,
-                                       unverifiable=True)
-                else:
-                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
-        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
-            """
-            Fallback to GET if HEAD is not allowed (405 HTTP error)
-            """
-            def http_error_405(self, req, fp, code, msg, headers):
-                fp.read()
-                fp.close()
-
-                newheaders = dict((k,v) for k,v in req.headers.items()
-                                  if k.lower() not in ("content-length", "content-type"))
-                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
-                                                 headers=newheaders,
-                                                 origin_req_host=req.get_origin_req_host(),
-                                                 unverifiable=True))
-
-        # Build our opener
-        opener = compat_urllib_request.OpenerDirector()
-        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
-                        HTTPMethodFallback, HEADRedirectHandler,
-                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
-            opener.add_handler(handler())
-
-        response = opener.open(HEADRequest(url))
-        if response is None:
-            raise ExtractorError('Invalid URL protocol')
-        return response
-
      def _extract_rss(self, url, video_id, doc):
          playlist_title = doc.find('./channel/title').text
          playlist_desc_el = doc.find('./channel/description')
@@ -372,6 +375,43 @@ class GenericIE(InfoExtractor):
              'entries': entries,
          }
  
+    def _extract_camtasia(self, url, video_id, webpage):
+        """ Returns None if no camtasia video can be found. """
+
+        camtasia_cfg = self._search_regex(
+            r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
+            webpage, 'camtasia configuration file', default=None)
+        if camtasia_cfg is None:
+            return None
+
+        title = self._html_search_meta('DC.title', webpage, fatal=True)
+
+        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_cfg = self._download_xml(
+            camtasia_url, video_id,
+            note='Downloading camtasia configuration',
+            errnote='Failed to download camtasia configuration')
+        fileset_node = camtasia_cfg.find('./playlist/array/fileset')
+
+        entries = []
+        for n in fileset_node.getchildren():
+            url_n = n.find('./uri')
+            if url_n is None:
+                continue
+
+            entries.append({
+                'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
+                'title': '%s - %s' % (title, n.tag),
+                'url': compat_urlparse.urljoin(url, url_n.text),
+                'duration': float_or_none(n.find('./duration').text),
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': title,
+        }
+
      def _real_extract(self, url):
          if url.startswith('//'):
              return {
@@ -408,17 +448,31 @@ class GenericIE(InfoExtractor):
              else:
                  assert ':' in default_search
                  return self.url_result(default_search + url)
-        video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+
+        url, smuggled_data = unsmuggle_url(url)
+        force_videoid = None
+        if smuggled_data and 'force_videoid' in smuggled_data:
+            force_videoid = smuggled_data['force_videoid']
+            video_id = force_videoid
+        else:
+            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
  
          self.to_screen('%s: Requesting header' % video_id)
  
-        try:
-            response = self._send_head(url)
+        head_req = HEADRequest(url)
+        response = self._request_webpage(
+            head_req, video_id,
+            note=False, errnote='Could not send HEAD request to %s' % url,
+            fatal=False)
  
+        if response is not False:
              # Check for redirect
              new_url = response.geturl()
              if url != new_url:
                  self.report_following_redirect(new_url)
+                if force_videoid:
+                    new_url = smuggle_url(
+                        new_url, {'force_videoid': force_videoid})
                  return self.url_result(new_url)
  
              # Check for direct link to a video
@@ -439,10 +493,6 @@ class GenericIE(InfoExtractor):
                      'upload_date': upload_date,
                  }
  
-        except compat_urllib_error.HTTPError:
-            # This may be a stupid server that doesn't like HEAD, our UA, or so
-            pass
-
          try:
              webpage = self._download_webpage(url, video_id)
          except ValueError:
@@ -460,6 +510,11 @@ class GenericIE(InfoExtractor):
          except compat_xml_parse_error:
              pass
  
+        # Is it a Camtasia project?
+        camtasia_res = self._extract_camtasia(url, video_id, webpage)
+        if camtasia_res is not None:
+            return camtasia_res
+
          # Sometimes embedded video player is hidden behind percent encoding
          # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
          # Unescaping the whole page allows to handle those cases in a generic way
@@ -475,10 +530,26 @@ class GenericIE(InfoExtractor):
              r'(?s)<title>(.*?)</title>', webpage, 'video title',
              default='video')
  
+        # Try to detect age limit automatically
+        age_limit = self._rta_search(webpage)
+        # And then there are the jokers who advertise that they use RTA,
+        # but actually don't.
+        AGE_LIMIT_MARKERS = [
+            r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
+        ]
+        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
+            age_limit = 18
+
          # video uploader is domain name
          video_uploader = self._search_regex(
              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
  
+        # Helper method
+        def _playlist_from_matches(matches, getter, ie=None):
+            urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches)
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
          # Look for BrightCove:
          bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
          if bc_urls:
@@ -514,6 +585,7 @@ class GenericIE(InfoExtractor):
          matches = re.findall(r'''(?x)
              (?:
                  <iframe[^>]+?src=|
+                data-video-url=|
                  <embed[^>]+?src=|
                  embedSWF\(?:\s*
              )
@@ -522,19 +594,15 @@ class GenericIE(InfoExtractor):
                  (?:embed|v)/.+?)
              \1''', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
-                     for tuppl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, lambda m: unescapeHTML(m[1]), ie='Youtube')
  
          # Look for embedded Dailymotion player
          matches = re.findall(
              r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(tuppl[1]))
-                     for tuppl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, lambda m: unescapeHTML(m[1]))
  
          # Look for embedded Wistia player
          match = re.search(
@@ -553,7 +621,7 @@ class GenericIE(InfoExtractor):
          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
          if mobj:
              return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
-        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)
+        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
          if mobj:
              return self.url_result(mobj.group(1), 'BlipTV')
  
@@ -648,10 +716,8 @@ class GenericIE(InfoExtractor):
          # Look for funnyordie embed
          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
          if matches:
-            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
-                     for eurl in matches]
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
+            return _playlist_from_matches(
+                matches, getter=unescapeHTML, ie='FunnyOrDie')
  
          # Look for embedded RUTV player
          rutv_url = RUTVIE._extract_url(webpage)
@@ -713,6 +779,13 @@ class GenericIE(InfoExtractor):
          if mobj is not None:
              return self.url_result(mobj.group('url'), 'Yahoo')
  
+        # Look for embedded sbs.com.au player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)sbs\.com\.au/ondemand/video/single/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'SBS')
+
          # Start with something easy: JW Player in SWFObject
          found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if not found:
@@ -730,6 +803,15 @@ class GenericIE(InfoExtractor):
          if not found:
              # Broaden the findall a little bit: JWPlayer JS loader
              found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+        if not found:
+            # Flow player
+            found = re.findall(r'''(?xs)
+                flowplayer\("[^"]+",\s*
+                    \{[^}]+?\}\s*,
+                    \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
+                        ["']?url["']?\s*:\s*["']([^"']+)["']
+            ''', webpage)
+            assert found
          if not found:
              # Try to find twitter cards info
              found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -739,7 +821,12 @@ class GenericIE(InfoExtractor):
              m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
              if m_video_type is not None:
-                found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+                def check_video(vurl):
+                    vpath = compat_urlparse.urlparse(vurl).path
+                    return '.' in vpath and not vpath.endswith('.swf')
+                found = list(filter(
+                    check_video,
+                    re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
          if not found:
              # HTML5 video
              found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
@@ -776,6 +863,7 @@ class GenericIE(InfoExtractor):
                  'url': video_url,
                  'uploader': video_uploader,
                  'title': video_title,
+                'age_limit': age_limit,
              })
  
          if len(entries) == 1:
diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py

new file mode 100644 (file)

index 0000000..726adff
--- /dev/null
+++ b/youtube_dl/extractor/grooveshark.py
@@ -0,0 +1,190 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import time
+import math
+import os.path
+import re
+
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
+
+from ..utils import (
+    compat_urllib_parse,
+    compat_urlparse,
+)
+
+
+class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
+    def __init__(self):
+        self._current_object = None
+        self.objects = []
+        compat_html_parser.HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict((k, v) for k, v in attrs)
+        if tag == 'object':
+            self._current_object = {'attrs': attrs, 'params': []}
+        elif tag == 'param':
+            self._current_object['params'].append(attrs)
+
+    def handle_endtag(self, tag):
+        if tag == 'object':
+            self.objects.append(self._current_object)
+            self._current_object = None
+
+    @classmethod
+    def extract_object_tags(cls, html):
+        p = cls()
+        p.feed(html)
+        p.close()
+        return p.objects
+
+
+class GroovesharkIE(InfoExtractor):
+    _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
+    _TEST = {
+        'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
+        'md5': '7ecf8aefa59d6b2098517e1baa530023',
+        'info_dict': {
+            'id': '6SS1DW',
+            'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
+            'ext': 'mp3',
+            'duration': 227,
+        }
+    }
+
+    do_playerpage_request = True
+    do_bootstrap_request = True
+
+    def _parse_target(self, target):
+        uri = compat_urlparse.urlparse(target)
+        hash = uri.fragment[1:].split('?')[0]
+        token = os.path.basename(hash.rstrip('/'))
+        return (uri, hash, token)
+
+    def _build_bootstrap_url(self, target):
+        (uri, hash, token) = self._parse_target(target)
+        query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
+        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
+
+    def _build_meta_url(self, target):
+        (uri, hash, token) = self._parse_target(target)
+        query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
+        return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
+
+    def _build_stream_url(self, meta):
+        return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))
+
+    def _build_swf_referer(self, target, obj):
+        (uri, _, _) = self._parse_target(target)
+        return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))
+
+    def _transform_bootstrap(self, js):
+        return re.split('(?m)^\s*try\s*{', js)[0] \
+                 .split(' = ', 1)[1].strip().rstrip(';')
+
+    def _transform_meta(self, js):
+        return js.split('\n')[0].split('=')[1].rstrip(';')
+
+    def _get_meta(self, target):
+        (meta_url, token) = self._build_meta_url(target)
+        self.to_screen('Metadata URL: %s' % meta_url)
+
+        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
+        req = compat_urllib_request.Request(meta_url, headers=headers)
+        res = self._download_json(req, token,
+                                  transform_source=self._transform_meta)
+
+        if 'getStreamKeyWithSong' not in res:
+            raise ExtractorError(
+                'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
+
+        if res['getStreamKeyWithSong'] is None:
+            raise ExtractorError(
+                'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
+                expected=True)
+
+        return res['getStreamKeyWithSong']
+
+    def _get_bootstrap(self, target):
+        (bootstrap_url, token) = self._build_bootstrap_url(target)
+
+        headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
+        req = compat_urllib_request.Request(bootstrap_url, headers=headers)
+        res = self._download_json(req, token, fatal=False,
+                                  note='Downloading player bootstrap data',
+                                  errnote='Unable to download player bootstrap data',
+                                  transform_source=self._transform_bootstrap)
+        return res
+
+    def _get_playerpage(self, target):
+        (_, _, token) = self._parse_target(target)
+
+        webpage = self._download_webpage(
+            target, token,
+            note='Downloading player page',
+            errnote='Unable to download player page',
+            fatal=False)
+
+        if webpage is not None:
+            # Search (for example German) error message
+            error_msg = self._html_search_regex(
+                r'<div id="content">\s*<h2>(.*?)</h2>', webpage,
+                'error message', default=None)
+            if error_msg is not None:
+                error_msg = error_msg.replace('\n', ' ')
+                raise ExtractorError('Grooveshark said: %s' % error_msg)
+
+        if webpage is not None:
+            o = GroovesharkHtmlParser.extract_object_tags(webpage)
+            return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'])
+
+        return (webpage, None)
+
+    def _real_initialize(self):
+        self.ts = int(time.time() * 1000)  # timestamp in millis
+
+    def _real_extract(self, url):
+        (target_uri, _, token) = self._parse_target(url)
+
+        # 1. Fill cookiejar by making a request to the player page
+        swf_referer = None
+        if self.do_playerpage_request:
+            (_, player_objs) = self._get_playerpage(url)
+            if player_objs is not None:
+                swf_referer = self._build_swf_referer(url, player_objs[0])
+                self.to_screen('SWF Referer: %s' % swf_referer)
+
+        # 2. Ask preload.php for swf bootstrap data to better mimic webapp
+        if self.do_bootstrap_request:
+            bootstrap = self._get_bootstrap(url)
+            self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])
+
+        # 3. Ask preload.php for track metadata.
+        meta = self._get_meta(url)
+
+        # 4. Construct stream request for track.
+        stream_url = self._build_stream_url(meta)
+        duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
+        post_dict = {'streamKey': meta['streamKey']['streamKey']}
+        post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8')
+        headers = {
+            'Content-Length': len(post_data),
+            'Content-Type': 'application/x-www-form-urlencoded'
+        }
+        if swf_referer is not None:
+            headers['Referer'] = swf_referer
+
+        return {
+            'id': token,
+            'title': meta['song']['Name'],
+            'http_method': 'POST',
+            'url': stream_url,
+            'ext': 'mp3',
+            'format': 'mp3 audio',
+            'duration': duration,
+            'http_post_data': post_data,
+            'http_headers': headers,
+        }
diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py

new file mode 100644 (file)

index 0000000..cf73cd7
--- /dev/null
+++ b/youtube_dl/extractor/jove.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate
+)
+
+
+class JoveIE(InfoExtractor):
+    """Extractor for video pages on jove.com."""
+
+    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+    # Template for the chapters XML; its root element carries the media URL.
+    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+    _TESTS = [
+        {
+            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
+            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
+            'info_dict': {
+                'id': '2744',
+                'ext': 'mp4',
+                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
+                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
+                'thumbnail': 're:^https?://.*\.png$',
+                'upload_date': '20110523',
+            }
+        },
+        {
+            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
+            'md5': '914aeb356f416811d911996434811beb',
+            'info_dict': {
+                'id': '51796',
+                'ext': 'mp4',
+                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
+                'description': 'md5:35ff029261900583970c4023b70f1dc9',
+                'thumbnail': 're:^https?://.*\.png$',
+                'upload_date': '20140802',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict for the jove.com video page at url.
+
+        The media URL is read from the ``video`` attribute of the root
+        element of the chapters XML that the page references.
+
+        Raises ExtractorError when that attribute is missing or empty.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        # The id used by the chapters endpoint is taken from the page itself
+        # rather than assumed to equal the id in the URL.
+        chapters_id = self._html_search_regex(
+            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
+
+        chapters_xml = self._download_xml(
+            self._CHAPTERS_URL.format(video_id=chapters_id),
+            video_id, note='Downloading chapters XML',
+            errnote='Failed to download chapters XML')
+
+        video_url = chapters_xml.attrib.get('video')
+        if not video_url:
+            raise ExtractorError('Failed to get the video URL')
+
+        title = self._html_search_meta('citation_title', webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+            webpage, 'description', fatal=False)
+        publish_date = unified_strdate(self._html_search_meta(
+            'citation_publication_date', webpage, 'publish date', fatal=False))
+        comment_count = self._html_search_regex(
+            r'<meta name="num_comments" content="(\d+) Comments?"',
+            webpage, 'comment count', fatal=False)
+        # Report the count as a number rather than as the matched text. The
+        # pattern only captures digits, so a found value always converts.
+        if comment_count is not None:
+            comment_count = int(comment_count)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+            'upload_date': publish_date,
+            'comment_count': comment_count,
+        }
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py

index 6436c05a3cd8e3f25499b9ff911de837a6c98207..1a896b536dd813a561cea8f870258bf73519e00b 100644 (file)
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -9,6 +9,7 @@ from ..utils import (
      compat_urllib_request,
      determine_ext,
      ExtractorError,
+    int_or_none,
  )
  
  
@@ -83,6 +84,21 @@ class MetacafeIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        # Movieclips.com video
+        {
+            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
+            'info_dict': {
+                'id': 'mv-Wy7ZU',
+                'ext': 'mp4',
+                'title': 'My Week with Marilyn - Do You Love Me?',
+                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
+                'uploader': 'movie_trailers',
+                'duration': 176,
+            },
+            'params': {
+                'skip_download': 'requires rtmpdump',
+            }
+        }
      ]
  
      def report_disclaimer(self):
@@ -134,6 +150,7 @@ class MetacafeIE(InfoExtractor):
  
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
+        video_url = None
          mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
          if mobj is not None:
              mediaURL = compat_urllib_parse.unquote(mobj.group(1))
@@ -146,16 +163,17 @@ class MetacafeIE(InfoExtractor):
              else:
                  gdaKey = mobj.group(1)
                  video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
-        else:
+        if video_url is None:
              mobj = re.search(r'<video src="([^"]+)"', webpage)
              if mobj:
                  video_url = mobj.group(1)
                  video_ext = 'mp4'
-            else:
-                mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
-                if mobj is None:
-                    raise ExtractorError('Unable to extract media URL')
-                vardict = compat_parse_qs(mobj.group(1))
+        if video_url is None:
+            flashvars = self._search_regex(
+                r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
+                default=None)
+            if flashvars:
+                vardict = compat_parse_qs(flashvars)
                  if 'mediaData' not in vardict:
                      raise ExtractorError('Unable to extract media URL')
                  mobj = re.search(
@@ -165,26 +183,68 @@ class MetacafeIE(InfoExtractor):
                  mediaURL = mobj.group('mediaURL').replace('\\/', '/')
                  video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
                  video_ext = determine_ext(video_url)
-
-        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title')
+        if video_url is None:
+            player_url = self._search_regex(
+                r"swfobject\.embedSWF\('([^']+)'",
+                webpage, 'config URL', default=None)
+            if player_url:
+                config_url = self._search_regex(
+                    r'config=(.+)$', player_url, 'config URL')
+                config_doc = self._download_xml(
+                    config_url, video_id,
+                    note='Downloading video config')
+                smil_url = config_doc.find('.//properties').attrib['smil_file']
+                smil_doc = self._download_xml(
+                    smil_url, video_id,
+                    note='Downloading SMIL document')
+                base_url = smil_doc.find('./head/meta').attrib['base']
+                video_url = []
+                for vn in smil_doc.findall('.//video'):
+                    br = int(vn.attrib['system-bitrate'])
+                    play_path = vn.attrib['src']
+                    video_url.append({
+                        'format_id': 'smil-%d' % br,
+                        'url': base_url,
+                        'play_path': play_path,
+                        'page_url': url,
+                        'player_url': player_url,
+                        'ext': play_path.partition(':')[0],
+                    })
+
+        if video_url is None:
+            raise ExtractorError('Unsupported video type')
+
+        video_title = self._html_search_regex(
+            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
          description = self._og_search_description(webpage)
          thumbnail = self._og_search_thumbnail(webpage)
          video_uploader = self._html_search_regex(
                  r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
                  webpage, 'uploader nickname', fatal=False)
+        duration = int_or_none(
+            self._html_search_meta('video:duration', webpage))
+
+        age_limit = (
+            18
+            if re.search(r'"contentRating":"restricted"', webpage)
+            else 0)
  
-        if re.search(r'"contentRating":"restricted"', webpage) is not None:
-            age_limit = 18
+        if isinstance(video_url, list):
+            formats = video_url
          else:
-            age_limit = 0
+            formats = [{
+                'url': video_url,
+                'ext': video_ext,
+            }]
  
+        self._sort_formats(formats)
          return {
              'id': video_id,
-            'url': video_url,
              'description': description,
              'uploader': video_uploader,
              'title': video_title,
-            'thumbnail':thumbnail,
-            'ext': video_ext,
+            'thumbnail': thumbnail,
              'age_limit': age_limit,
+            'formats': formats,
+            'duration': duration,
          }
diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py

new file mode 100644 (file)

index 0000000..949ad11
--- /dev/null
+++ b/youtube_dl/extractor/ministrygrid.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+)
+
+
+class MinistryGridIE(InfoExtractor):
+    """Extractor for ministrygrid.com pages whose video sits in a portlet."""
+
+    # Every dot of the host name is escaped so it only matches a literal '.'.
+    _VALID_URL = r'https?://www\.ministrygrid\.com/([^/?#]*/)*(?P<id>[^/#?]+)/?(?:$|[?#])'
+
+    _TEST = {
+        'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers',
+        'md5': '844be0d2a1340422759c2a9101bab017',
+        'info_dict': {
+            'id': '3453494717001',
+            'ext': 'mp4',
+            'title': 'The Gospel by Numbers',
+            'description': 'Coming soon from T4G 2014!',
+            'uploader': 'LifeWay Christian Resources (MG)',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return a 'url' result pointing at the first iframe found in a portlet.
+
+        The page lists its portlets in ``Liferay.Portlet.list``; each one is
+        rendered in turn and searched for an iframe. The iframe URL is
+        passed on with the page's video id smuggled in as ``force_videoid``.
+
+        Raises ExtractorError when no portlet contains an iframe.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        portlets_json = self._search_regex(
+            r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list')
+        portlets = json.loads(portlets_json)
+        pl_id = self._search_regex(
+            r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id')
+
+        for i, portlet in enumerate(portlets):
+            portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet)
+            portlet_code = self._download_webpage(
+                portlet_url, video_id,
+                note='Looking in portlet %s (%d/%d)' % (portlet, i + 1, len(portlets)),
+                fatal=False)
+            if not portlet_code:
+                # The download is non-fatal, so there may be no markup to
+                # search; try the next portlet instead of searching nothing.
+                continue
+            video_iframe_url = self._search_regex(
+                r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe',
+                default=None)
+            if video_iframe_url:
+                surl = smuggle_url(
+                    video_iframe_url, {'force_videoid': video_id})
+                return {
+                    '_type': 'url',
+                    'id': video_id,
+                    'url': surl,
+                }
+
+        raise ExtractorError('Could not find video iframe in any portlets')
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py

new file mode 100644 (file)

index 0000000..979f3d6
--- /dev/null
+++ b/youtube_dl/extractor/mitele.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    get_element_by_attribute,
+    parse_duration,
+    strip_jsonp,
+)
+
+
+class MiTeleIE(InfoExtractor):
+    """Extractor for episode pages on mitele.es."""
+
+    IE_NAME = 'mitele.es'
+    _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
+
+    _TEST = {
+        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
+        'md5': '6a75fe9d0d3275bead0cb683c616fddb',
+        'info_dict': {
+            'id': '0fce117d',
+            'ext': 'mp4',
+            'title': 'Programa 144 - Tor, la web invisible',
+            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+            'display_id': 'programa-144',
+            'duration': 2913,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the episode page at url.
+
+        The page's ``MSV.embedData`` object names an XML document holding
+        the episode info; the link found there is exchanged for a tokenized
+        URL, which becomes the media URL.
+        """
+        display_id = re.match(self._VALID_URL, url).group('episode')
+        page = self._download_webpage(url, display_id)
+
+        raw_embed = self._search_regex(
+            r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', page, 'embed data',
+            flags=re.DOTALL)
+        # The embedded object is single-quoted; swap the quotes so that it
+        # can be read by the JSON parser.
+        embed = json.loads(raw_embed.replace('\'', '"'))
+
+        info_doc = self._download_xml(embed['flashvars']['host'], display_id)
+        info = info_doc.find('./video/info')
+
+        link = info.find('videoUrl/link').text
+        token_url = 'http://token.mitele.es/?' + compat_urllib_parse.urlencode(
+            {'id': link})
+        token = self._download_json(
+            token_url, display_id, transform_source=strip_jsonp)
+
+        return {
+            'id': embed['videoId'],
+            'display_id': display_id,
+            'title': info.find('title').text,
+            'url': token['tokenizedUrl'],
+            'description': get_element_by_attribute('class', 'text', page),
+            'thumbnail': info.find('thumb').text,
+            'duration': parse_duration(info.find('duration').text),
+        }
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py

new file mode 100644 (file)

index 0000000..456807d
--- /dev/null
+++ b/youtube_dl/extractor/movieclips.py
@@ -0,0 +1,78 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    compat_str,
+    clean_html,
+)
+
+
+class MovieClipsIE(InfoExtractor):
+    """Extractor for clips on movieclips.com."""
+
+    _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?'
+    _TEST = {
+        'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/',
+        'info_dict': {
+            'id': 'Wy7ZU',
+            'display_id': 'my-week-with-marilyn-movie-do-you-love-me',
+            'ext': 'mp4',
+            'title': 'My Week with Marilyn - Do You Love Me?',
+            'description': 'md5:e86795bd332fe3cff461e7c8dc542acb',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the clip at url.
+
+        The player config names a SMIL file; every ``video`` entry of that
+        file becomes one format that shares the SMIL base URL and differs
+        in play path and bitrate.
+
+        Raises ExtractorError (expected) when the config's
+        ``country-region`` element reads 'false'.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+        # Used for progress output: the readable slug when the URL has one.
+        show_id = display_id or video_id
+
+        config = self._download_xml(
+            'http://config.movieclips.com/player/config/%s' % video_id,
+            show_id, 'Downloading player config')
+
+        if config.find('./country-region').text == 'false':
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True)
+
+        properties = config.find('./video/properties')
+        smil_file = properties.attrib['smil_file']
+
+        smil = self._download_xml(smil_file, show_id, 'Downloading SMIL')
+        base_url = smil.find('./head/meta').attrib['base']
+
+        formats = []
+        for video in smil.findall('./body/switch/video'):
+            # Floor division keeps vbr an integer on Python 2 and Python 3
+            # alike; this module does not import division from __future__.
+            vbr = int(video.attrib['system-bitrate']) // 1000
+            src = video.attrib['src']
+            formats.append({
+                'url': base_url,
+                'play_path': src,
+                'ext': src.split(':')[0],
+                'vbr': vbr,
+                'format_id': '%dk' % vbr,
+            })
+
+        self._sort_formats(formats)
+
+        title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title'])
+        description = clean_html(compat_str(properties.attrib['clip_description']))
+        thumbnail = properties.attrib['image']
+        categories = properties.attrib['clip_categories'].split(',')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py

index 280328b78306e5ab332cbb7111127f832c6c9aba..58ec81f91115b9df146f7570f5ef508f86a35fde 100644 (file)
--- a/youtube_dl/extractor/nuvid.py
+++ b/youtube_dl/extractor/nuvid.py
@@ -38,7 +38,7 @@ class NuvidIE(InfoExtractor):
              webpage = self._download_webpage(
                  request, video_id, 'Downloading %s page' % format_id)
              video_url = self._html_search_regex(
-                r'<a href="([^"]+)"\s*>Continue to watch video', webpage, '%s video URL' % format_id, fatal=False)
+                r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False)
              if not video_url:
                  continue
              formats.append({
@@ -49,19 +49,24 @@ class NuvidIE(InfoExtractor):
          webpage = self._download_webpage(
              'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page')
          title = self._html_search_regex(
-            r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>', webpage, 'title').strip()
-        thumbnail = self._html_search_regex(
-            r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"',
-            webpage, 'thumbnail URL', fatal=False)
+            [r'<span title="([^"]+)">',
+             r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip()
+        thumbnails = [
+            {
+                'url': thumb_url,
+            } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage)
+        ]
+        thumbnail = thumbnails[0]['url'] if thumbnails else None
          duration = parse_duration(self._html_search_regex(
-            r'Length:\s*<span>(\d{2}:\d{2})</span>',webpage, 'duration', fatal=False))
+            r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False))
          upload_date = unified_strdate(self._html_search_regex(
-            r'Added:\s*<span>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload date', fatal=False))
+            r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False))
  
          return {
              'id': video_id,
              'title': title,
-            'thumbnail': 'http://m.nuvid.com%s' % thumbnail,
+            'thumbnails': thumbnails,
+            'thumbnail': thumbnail,
              'duration': duration,
              'upload_date': upload_date,
              'age_limit': 18,
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py

new file mode 100644 (file)

index 0000000..5429592
--- /dev/null
+++ b/youtube_dl/extractor/patreon.py
@@ -0,0 +1,100 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+)
+
+
+class PatreonIE(InfoExtractor):
+    """Extractor for audio creations on patreon.com."""
+
+    # The id stops at '&' or '#' so that extra query parameters or a
+    # fragment after hid=... do not become part of the video id.
+    _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.patreon.com/creation?hid=743933',
+            'md5': 'e25505eec1053a6e6813b8ed369875cc',
+            'info_dict': {
+                'id': '743933',
+                'ext': 'mp3',
+                'title': 'Episode 166: David Smalley of Dogma Debate',
+                'uploader': 'Cognitive Dissonance Podcast',
+                'thumbnail': 're:^https?://.*$',
+            },
+        },
+        {
+            'url': 'http://www.patreon.com/creation?hid=754133',
+            'md5': '3eb09345bf44bf60451b8b0b81759d0a',
+            'info_dict': {
+                'id': '754133',
+                'ext': 'mp3',
+                'title': 'CD 167 Extra',
+                'uploader': 'Cognitive Dissonance Podcast',
+                'thumbnail': 're:^https?://.*$',
+            },
+        },
+    ]
+
+    # Currently Patreon exposes download URL via hidden CSS, so login is not
+    # needed. Keeping this commented for when this inevitably changes.
+    '''
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'redirectUrl': 'http://www.patreon.com/',
+            'email': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            'https://www.patreon.com/processLogin',
+            compat_urllib_parse.urlencode(login_form).encode('utf-8')
+        )
+        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+
+        if re.search(r'onLoginFailed', login_page):
+            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+    '''
+
+    def _real_extract(self, url):
+        """Return the info dict for the creation page at url.
+
+        An attachment link is preferred when the page has one; otherwise
+        the first entry of the page's jPlayer playlist supplies the media
+        URL, cover and artist.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage).strip()
+
+        attach_fn = self._html_search_regex(
+            r'<div class="attach"><a target="_blank" href="([^"]+)">',
+            webpage, 'attachment URL', default=None)
+        if attach_fn is not None:
+            # The attachment href is prefixed with the site root.
+            video_url = 'http://www.patreon.com' + attach_fn
+            thumbnail = self._og_search_thumbnail(webpage)
+            uploader = self._html_search_regex(
+                r'<strong>(.*?)</strong> is creating', webpage, 'uploader')
+        else:
+            playlist_js = self._search_regex(
+                r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])',
+                webpage, 'playlist JSON')
+            # The playlist is a JavaScript literal; convert it before parsing.
+            playlist_json = js_to_json(playlist_js)
+            playlist = json.loads(playlist_json)
+            data = playlist[0]
+            video_url = self._proto_relative_url(data['mp3'])
+            thumbnail = self._proto_relative_url(data.get('cover'))
+            uploader = data.get('artist')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp3',
+            'title': title,
+            'uploader': uploader,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py

index ec95d070411f97e1dad2fde881c9e5f847caafc8..2adfde9091b5ceae50abd8f0c79abd129c259751 100644 (file)
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -20,17 +20,53 @@ class PBSIE(InfoExtractor):
          )
      '''
  
-    _TEST = {
-        'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
-        'md5': 'ce1888486f0908d555a8093cac9a7362',
-        'info_dict': {
-            'id': '2365006249',
-            'ext': 'mp4',
-            'title': 'A More Perfect Union',
-            'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
-            'duration': 3190,
+    _TESTS = [
+        {
+            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
+            'md5': 'ce1888486f0908d555a8093cac9a7362',
+            'info_dict': {
+                'id': '2365006249',
+                'ext': 'mp4',
+                'title': 'A More Perfect Union',
+                'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
+                'duration': 3190,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
+            'md5': '143c98aa54a346738a3d78f54c925321',
+            'info_dict': {
+                'id': '2365297690',
+                'ext': 'mp4',
+                'title': 'Losing Iraq',
+                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+                'duration': 5050,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
+            'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
+            'info_dict': {
+                'id': '2201174722',
+                'ext': 'mp4',
+                'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+                'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
+                'duration': 801,
+            },
          },
-    }
+        {
+            'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/',
+            'md5': 'c62859342be2a0358d6c9eb306595978',
+            'info_dict': {
+                'id': '2365297708',
+                'ext': 'mp4',
+                'description': 'md5:68d87ef760660eb564455eb30ca464fe',
+                'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+                'duration': 6559,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
  
      def _extract_ids(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -40,15 +76,18 @@ class PBSIE(InfoExtractor):
          if presumptive_id:
              webpage = self._download_webpage(url, display_id)
  
-            # frontline video embed
+            MEDIA_ID_REGEXES = [
+                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
+                r'class="coveplayerid">([^<]+)<',                       # coveplayer
+            ]
+
              media_id = self._search_regex(
-                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",
-                webpage, 'frontline video ID', fatal=False, default=None)
+                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
              if media_id:
                  return media_id, presumptive_id
  
              url = self._search_regex(
-                r'<iframe\s+id=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
+                r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
                  webpage, 'player URL')
              mobj = re.match(self._VALID_URL, url)
  
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py

new file mode 100644 (file)

index 0000000..72df4d8
--- /dev/null
+++ b/youtube_dl/extractor/playfm.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+)
+
+
+class PlayFMIE(InfoExtractor):
+    """Extractor for recordings on play.fm."""
+
+    IE_NAME = 'play.fm'
+    # The URL ends in eight digits of upload date followed by a six-digit id.
+    _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])'
+
+    _TEST = {
+        'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220',
+        'md5': 'c505f8307825a245d0c7ad1850001f22',
+        'info_dict': {
+            'id': '137220',
+            'ext': 'mp3',
+            'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
+            'uploader': 'Sven Tasnadi',
+            'uploader_id': 'sventasnadi',
+            'duration': 5627.428,
+            'upload_date': '20140712',
+            'view_count': int,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the recording at url.
+
+        The recording's XML description is requested with a form-encoded
+        POST; the media URL is assembled from fields of that document.
+
+        Raises ExtractorError when the document contains an ``error`` node.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        upload_date = mobj.group('upload_date')
+
+        # A request body has to be bytes on Python 3, so encode the form.
+        rec_data = compat_urllib_parse.urlencode(
+            {'rec_id': video_id}).encode('utf-8')
+        req = compat_urllib_request.Request(
+            'http://www.play.fm/flexRead/recording', data=rec_data)
+        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        rec_doc = self._download_xml(req, video_id)
+
+        error_node = rec_doc.find('./error')
+        if error_node is not None:
+            raise ExtractorError('An error occurred: %s (code %s)' % (
+                error_node.text, rec_doc.find('./status').text))
+
+        recording = rec_doc.find('./recording')
+        title = recording.find('./title').text
+        view_count = int_or_none(recording.find('./stats/playcount').text)
+        duration = float_or_none(recording.find('./duration').text, scale=1000)
+        thumbnail = recording.find('./image').text
+
+        artist = recording.find('./artists/artist')
+        uploader = artist.find('./name').text
+        uploader_id = artist.find('./slug').text
+
+        video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % (
+            'http:', recording.find('./url').text,
+            recording.find('./_class').text, recording.find('./file_id').text,
+            rec_doc.find('./uuid').text, video_id,
+            rec_doc.find('./jingle/file_id').text,
+            'http%3A%2F%2Fwww.play.fm%2Fplayer',
+        )
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp3',
+            'filesize': int_or_none(recording.find('./size').text),
+            'title': title,
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+        }
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py

index 35dc5a9ffafb32d36e30f51988291dded6a6d18c..04bd3d9793c0424c6dded7d727e1f8cac629377c 100644 (file)
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
  import re
  
  from .common import InfoExtractor
@@ -9,15 +11,16 @@ from ..utils import (
  
  
  class PornotubeIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
+    _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
      _TEST = {
-        u'url': u'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
-        u'file': u'1689755.flv',
-        u'md5': u'374dd6dcedd24234453b295209aa69b6',
-        u'info_dict': {
-            u"upload_date": u"20090708", 
-            u"title": u"Marilyn-Monroe-Bathing",
-            u"age_limit": 18
+        'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
+        'md5': '374dd6dcedd24234453b295209aa69b6',
+        'info_dict': {
+            'id': '1689755',
+            'ext': 'flv',
+            'upload_date': '20090708',
+            'title': 'Marilyn-Monroe-Bathing',
+            'age_limit': 18
          }
      }
  
@@ -32,22 +35,22 @@ class PornotubeIE(InfoExtractor):
  
          # Get the video URL
          VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
-        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
+        video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url')
          video_url = compat_urllib_parse.unquote(video_url)
  
          #Get the uploaded date
          VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
-        if upload_date: upload_date = unified_strdate(upload_date)
+        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False)
+        if upload_date:
+            upload_date = unified_strdate(upload_date)
          age_limit = self._rta_search(webpage)
  
-        info = {'id': video_id,
-                'url': video_url,
-                'uploader': None,
-                'upload_date': upload_date,
-                'title': video_title,
-                'ext': 'flv',
-                'format': 'flv',
-                'age_limit': age_limit}
-
-        return [info]
+        return {
+            'id': video_id,
+            'url': video_url,
+            'upload_date': upload_date,
+            'title': video_title,
+            'ext': 'flv',
+            'format': 'flv',
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py

new file mode 100644 (file)

index 0000000..190c8f2
--- /dev/null
+++ b/youtube_dl/extractor/rtlnl.py
@@ -0,0 +1,51 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RtlXlIE(InfoExtractor):
+    IE_NAME = 'rtlxl.nl'
+    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+
+    _TEST = {
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
+        'info_dict': {
+            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+            'ext': 'flv',
+            'title': 'RTL Nieuws - Laat',
+            'description': 'Dagelijks het laatste nieuws uit binnen- en '
+                'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van '
+                'onze mobiele apps.',
+            'timestamp': 1408051800,
+            'upload_date': '20140814',
+        },
+        'params': {
+            # We download the first bytes of the first fragment, it can't be
+            # processed by the f4m downloader because it isn't complete
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uuid = mobj.group('uuid')
+
+        info = self._download_json(
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+            uuid)
+        material = info['material'][0]
+        episode_info = info['episodes'][0]
+
+        f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
+        progname = info['abstracts'][0]['name']
+        subtitle = material['title'] or info['episodes'][0]['name']
+
+        return {
+            'id': uuid,
+            'title': '%s - %s' % (progname, subtitle), 
+            'formats': self._extract_f4m_formats(f4m_url, uuid),
+            'timestamp': material['original_date'],
+            'description': episode_info['synopsis'],
+        }
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py

index c2228b2f0f6a1fc9bba02cddcb5a1740cc85038d..4dd35a47b35b5341139aa2d7f27886b52ddad5ce 100644 (file)
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -1,21 +1,66 @@
  # encoding: utf-8
  from __future__ import unicode_literals
  
-import re
  import base64
+import re
+import time
  
  from .common import InfoExtractor
  from ..utils import (
      struct_unpack,
+    remove_end,
  )
  
  
+def _decrypt_url(png):
+    encrypted_data = base64.b64decode(png)
+    text_index = encrypted_data.find(b'tEXt')
+    text_chunk = encrypted_data[text_index - 4:]
+    length = struct_unpack('!I', text_chunk[:4])[0]
+    # Use bytearray to get integers when iterating in both python 2.x and 3.x
+    data = bytearray(text_chunk[8:8 + length])
+    data = [chr(b) for b in data if b != 0]
+    hash_index = data.index('#')
+    alphabet_data = data[:hash_index]
+    url_data = data[hash_index + 1:]
+
+    alphabet = []
+    e = 0
+    d = 0
+    for l in alphabet_data:
+        if d == 0:
+            alphabet.append(l)
+            d = e = (e + 1) % 4
+        else:
+            d -= 1
+    url = ''
+    f = 0
+    e = 3
+    b = 1
+    for letter in url_data:
+        if f == 0:
+            l = int(letter) * 10
+            f = 1
+        else:
+            if e == 0:
+                l += int(letter)
+                url += alphabet[l]
+                e = (b + 3) % 4
+                f = 0
+                b += 1
+            else:
+                e -= 1
+
+    return url
+
+
+
  class RTVEALaCartaIE(InfoExtractor):
      IE_NAME = 'rtve.es:alacarta'
      IE_DESC = 'RTVE a la carta'
      _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
  
-    _TEST = {
+    _TESTS = [{
          'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
          'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
          'info_dict': {
@@ -23,48 +68,15 @@ class RTVEALaCartaIE(InfoExtractor):
              'ext': 'mp4',
              'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
          },
-    }
-
-    def _decrypt_url(self, png):
-        encrypted_data = base64.b64decode(png)
-        text_index = encrypted_data.find(b'tEXt')
-        text_chunk = encrypted_data[text_index-4:]
-        length = struct_unpack('!I', text_chunk[:4])[0]
-        # Use bytearray to get integers when iterating in both python 2.x and 3.x
-        data = bytearray(text_chunk[8:8+length])
-        data = [chr(b) for b in data if b != 0]
-        hash_index = data.index('#')
-        alphabet_data = data[:hash_index]
-        url_data = data[hash_index+1:]
-
-        alphabet = []
-        e = 0
-        d = 0
-        for l in alphabet_data:
-            if d == 0:
-                alphabet.append(l)
-                d = e = (e + 1) % 4
-            else:
-                d -= 1
-        url = ''
-        f = 0
-        e = 3
-        b = 1
-        for letter in url_data:
-            if f == 0:
-                l = int(letter)*10
-                f = 1
-            else:
-                if e == 0:
-                    l += int(letter)
-                    url += alphabet[l]
-                    e = (b + 3) % 4
-                    f = 0
-                    b += 1
-                else:
-                    e -= 1
-
-        return url
+    }, {
+        'note': 'Live stream',
+        'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
+        'info_dict': {
+            'id': '1694255',
+            'ext': 'flv',
+            'title': 'TODO',
+        }
+    }]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -74,11 +86,57 @@ class RTVEALaCartaIE(InfoExtractor):
              video_id)['page']['items'][0]
          png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id
          png = self._download_webpage(png_url, video_id, 'Downloading url information')
-        video_url = self._decrypt_url(png)
+        video_url = _decrypt_url(png)
  
          return {
              'id': video_id,
              'title': info['title'],
              'url': video_url,
-            'thumbnail': info['image'],
+            'thumbnail': info.get('image'),
+            'page_url': url,
+        }
+
+
+class RTVELiveIE(InfoExtractor):
+    IE_NAME = 'rtve.es:live'
+    IE_DESC = 'RTVE.es live streams'
+    _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+
+    _TESTS = [{
+        'url': 'http://www.rtve.es/noticias/directo-la-1/',
+        'info_dict': {
+            'id': 'directo-la-1',
+            'ext': 'flv',
+            'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+        },
+        'params': {
+            'skip_download': 'live stream',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        start_time = time.gmtime()
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        player_url = self._search_regex(
+            r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL')
+        title = remove_end(self._og_search_title(webpage), ' en directo')
+        title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
+
+        vidplayer_id = self._search_regex(
+            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+        png = self._download_webpage(png_url, video_id, 'Downloading url information')
+        video_url = _decrypt_url(png)
+
+        return {
+            'id': video_id,
+            'ext': 'flv',
+            'title': title,
+            'url': video_url,
+            'app': 'rtve-live-live?ovpfv=2.1.2',
+            'player_url': player_url,
+            'rtmp_live': True,
          }
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py

new file mode 100644 (file)

index 0000000..34058fd
--- /dev/null
+++ b/youtube_dl/extractor/sbs.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import json
+import re
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    remove_end,
+)
+
+
+class SBSIE(InfoExtractor):  # Extractor for sbs.com.au on-demand single-video pages
+    IE_DESC = 'sbs.com.au'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/'
+
+    _TESTS = [{
+        # Original URL is handled by the generic IE which finds the iframe:
+        # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
+        'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
+        'md5': '3150cf278965eeabb5b4cea1c963fe0a',
+        'info_dict': {
+            'id': '320403011771',
+            'ext': 'flv',
+            'title': 'Dingo Conservation',
+            'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
+            'thumbnail': r're:http://.*\.jpg',  # raw literal: '\.' is not a valid string escape
+        },
+        'add_ies': ['generic'],
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        release_urls_json = js_to_json(self._search_regex(  # JS object literal -> JSON text
+            r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
+            webpage, 'release URLs'))  # non-empty field name instead of ''
+        release_urls = json.loads(release_urls_json)
+        theplatform_url = (  # prefer the 'progressive' entry, fall back to 'standard'
+            release_urls.get('progressive') or release_urls.get('standard'))
+
+        title = remove_end(self._og_search_title(webpage), ' (The Feed)')
+        description = self._html_search_meta('description', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': theplatform_url,
+
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py

index f8dd7e955ada5ce58fd04d668027587eda1b6c00..fa796ce72126610cda53db5378d926b44d72e526 100644 (file)
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -37,7 +37,7 @@ class TeamcocoIE(InfoExtractor):
          video_id = mobj.group("video_id")
          if not video_id:
              video_id = self._html_search_regex(
-                r'<article class="video" data-id="(\d+?)"',
+                r'data-node-id="(\d+?)"',
                  webpage, 'video id')
  
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py

index 6d3b78749eaf1b25d23422343ab36c9d84c7cdc0..affef650726d716b7e80aaab5c66dab3bc3ddc28 100644 (file)
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -44,7 +44,7 @@ class VodlockerIE(InfoExtractor):
                  req, video_id, 'Downloading video page')
  
          title = self._search_regex(
-            r'id="file_title".*?>\s*(.*?)\s*<span', webpage, 'title')
+            r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title')
          thumbnail = self._search_regex(
              r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail')
          url = self._search_regex(
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py

index a584e08966ac57354c51a71d7fee520d7ce67df8..1f330378a7fbd72bd23c92747930e85e9145e452 100644 (file)
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -2,27 +2,30 @@
  from __future__ import unicode_literals
  
  import re
+import time
+import hashlib
  
  from .common import InfoExtractor
  from ..utils import (
+    ExtractorError,
      unified_strdate,
  )
  
  
  class WatIE(InfoExtractor):
-    _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
+    _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'
      IE_NAME = 'wat.tv'
      _TEST = {
-        'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+        'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
+        'md5': 'ce70e9223945ed26a8056d413ca55dc9',
          'info_dict': {
-            'id': '10631273',
+            'id': '11713067',
+            'display_id': 'soupe-figues-l-orange-aux-epices',
              'ext': 'mp4',
-            'title': 'World War Z - Philadelphia VOST',
-            'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
-        },
-        'params': {
-            # Sometimes wat serves the whole file with the --test option
-            'skip_download': True,
+            'title': 'Soupe de figues à l\'orange et aux épices',
+            'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
+            'upload_date': '20140819',
+            'duration': 120,
          },
      }
  
@@ -36,13 +39,20 @@ class WatIE(InfoExtractor):
          def real_id_for_chapter(chapter):
              return chapter['tc_start'].split('-')[0]
          mobj = re.match(self._VALID_URL, url)
-        short_id = mobj.group('shortID')
-        webpage = self._download_webpage(url, short_id)
+        short_id = mobj.group('short_id')
+        display_id = mobj.group('display_id')
+        webpage = self._download_webpage(url, display_id or short_id)
          real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
  
          video_info = self.download_video_info(real_id)
+
+        if video_info.get('geolock'):
+            raise ExtractorError('This content is not available in your area', expected=True)
+
          chapters = video_info['chapters']
          first_chapter = chapters[0]
+        files = video_info['files']
+        first_file = files[0]
  
          if real_id_for_chapter(first_chapter) != real_id:
              self.to_screen('Multipart video detected')
@@ -61,12 +71,45 @@ class WatIE(InfoExtractor):
              upload_date = unified_strdate(first_chapter['date_diffusion'])
          # Otherwise we can continue and extract just one part, we have to use
          # the short id for getting the video url
+
+        formats = [{
+            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+            'format_id': 'Mobile',
+        }]
+
+        fmts = [('SD', 'web')]
+        if first_file.get('hasHD'):
+            fmts.append(('HD', 'webhd'))
+
+        def compute_token(param):
+            timestamp = '%08x' % int(time.time())
+            magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
+            return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
+
+        for fmt in fmts:
+            webid = '/%s/%s' % (fmt[1], real_id)
+            video_url = self._download_webpage(
+                'http://www.wat.tv/get%s?token=%s&getURL=1' % (webid, compute_token(webid)),
+                real_id,
+                'Downloding %s video URL' % fmt[0],
+                'Failed to download %s video URL' % fmt[0],
+                False)
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                'format_id': fmt[0],
+            })
+
          return {
              'id': real_id,
-            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+            'display_id': display_id,
              'title': first_chapter['title'],
              'thumbnail': first_chapter['preview'],
              'description': first_chapter['description'],
              'view_count': video_info['views'],
              'upload_date': upload_date,
+            'duration': first_file['duration'],
+            'formats': formats,
          }
diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py

new file mode 100644 (file)

index 0000000..af7bb8b
--- /dev/null
+++ b/youtube_dl/extractor/wayofthemaster.py
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class WayOfTheMasterIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])'
+
+    _TEST = {
+        'url': 'http://www.wayofthemaster.com/hbks.shtml',
+        'md5': '5316b57487ada8480606a93cb3d18d24',
+        'info_dict': {
+            'id': 'hbks',
+            'ext': 'mp4',
+            'title': 'Intelligent Design vs. Evolution',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(
+            r'<img src="images/title_[^"]+".*?alt="([^"]+)"',
+            webpage, 'title', default=None)
+        if title is None:
+            title = self._html_search_regex(
+                r'<title>(.*?)</title>', webpage, 'page title')
+
+        url_base = self._search_regex(
+            r'<param\s+name="?movie"?\s+value=".*?/wotm_videoplayer_highlow[0-9]*\.swf\?vid=([^"]+)"',
+            webpage, 'URL base')
+        formats = [{
+            'format_id': 'low',
+            'quality': 1,
+            'url': url_base + '_low.mp4',
+        }, {
+            'format_id': 'high',
+            'quality': 2,
+            'url': url_base + '_high.mp4',
+        }]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py

index 5374495f9b08f4d13fd7552fd612c19339b99e54..00b6d1eba33a6686319d47846c34476ce8b387c7 100644 (file)
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -14,7 +14,7 @@ from ..utils import (
  
  class XHamsterIE(InfoExtractor):
      """Information Extractor for xHamster"""
-    _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
+    _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
      _TESTS = [
          {
              'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 5bfe5e7e586a84b89c3365866f80d1e91bbadf73..75044d71a3fd9f81fa5d89ab8283eb13e5d8191d 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -297,7 +297,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
  
          # Dash webm audio
-        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
+        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
          '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
  
          # RTMP (unnamed)
@@ -446,6 +446,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  return lambda s: u''.join(s[i] for i in cache_spec)
              except IOError:
                  pass  # No cache available
+            except ValueError:
+                try:
+                    file_size = os.path.getsize(cache_fn)
+                except (OSError, IOError) as oe:
+                    file_size = str(oe)
+                self._downloader.report_warning(
+                    u'Cache %s failed (%s)' % (cache_fn, file_size))
  
          if player_type == 'js':
              code = self._download_webpage(
@@ -573,6 +580,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          sub_lang_list = {}
          for l in lang_list:
              lang = l[1]
+            if lang in sub_lang_list:
+                continue
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 65b492fb336e4e26f7193377c48282a3b5464939..8095400d03dee6731b2bd24951597b74a90f1991 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -24,6 +24,7 @@ import socket
  import struct
  import subprocess
  import sys
+import tempfile
  import traceback
  import xml.etree.ElementTree
  import zlib
@@ -228,18 +229,42 @@ else:
          assert type(s) == type(u'')
          print(s)
  
-# In Python 2.x, json.dump expects a bytestream.
-# In Python 3.x, it writes to a character stream
-if sys.version_info < (3,0):
-    def write_json_file(obj, fn):
-        with open(fn, 'wb') as f:
-            json.dump(obj, f)
-else:
-    def write_json_file(obj, fn):
-        with open(fn, 'w', encoding='utf-8') as f:
-            json.dump(obj, f)
  
-if sys.version_info >= (2,7):
+def write_json_file(obj, fn):
+    """ Encode obj as JSON and write it to fn, atomically """
+
+    args = {
+        'suffix': '.tmp',
+        'prefix': os.path.basename(fn) + '.',
+        'dir': os.path.dirname(fn),
+        'delete': False,
+    }
+
+    # In Python 2.x, json.dump expects a bytestream.
+    # In Python 3.x, it writes to a character stream
+    if sys.version_info < (3, 0):
+        args['mode'] = 'wb'
+    else:
+        args.update({
+            'mode': 'w',
+            'encoding': 'utf-8',
+        })
+
+    tf = tempfile.NamedTemporaryFile(**args)
+
+    try:
+        with tf:
+            json.dump(obj, tf)
+        os.rename(tf.name, fn)
+    except:
+        try:
+            os.remove(tf.name)
+        except OSError:
+            pass
+        raise
+
+
+if sys.version_info >= (2, 7):
      def find_xpath_attr(node, xpath, key, val):
          """ Find the xpath xpath[@key=val] """
          assert re.match(r'^[a-zA-Z-]+$', key)
@@ -827,8 +852,10 @@ def unified_strdate(date_str):
          '%b %dnd %Y %I:%M%p',
          '%b %dth %Y %I:%M%p',
          '%Y-%m-%d',
+        '%Y/%m/%d',
          '%d.%m.%Y',
          '%d/%m/%Y',
+        '%d/%m/%y',
          '%Y/%m/%d %H:%M:%S',
          '%Y-%m-%d %H:%M:%S',
          '%d.%m.%Y %H:%M',
@@ -1259,6 +1286,12 @@ def remove_start(s, start):
      return s
  
  
+def remove_end(s, end):  # Return s without the trailing suffix `end`, if present
+    if end and s.endswith(end):  # empty `end`: s[:-0] is s[:0] == '', so skip it
+        return s[:-len(end)]
+    return s
+
+
  def url_basename(url):
      path = compat_urlparse.urlparse(url).path
      return path.strip(u'/').split(u'/')[-1]
@@ -1448,6 +1481,34 @@ def strip_jsonp(code):
      return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
  
  
+def js_to_json(code):
+    def fix_kv(m):
+        key = m.group(2)
+        if key.startswith("'"):
+            assert key.endswith("'")
+            assert '"' not in key
+            key = '"%s"' % key[1:-1]
+        elif not key.startswith('"'):
+            key = '"%s"' % key
+
+        value = m.group(4)
+        if value.startswith("'"):
+            assert value.endswith("'")
+            assert '"' not in value
+            value = '"%s"' % value[1:-1]
+
+        return m.group(1) + key + m.group(3) + value
+
+    res = re.sub(r'''(?x)
+            ([{,]\s*)
+            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
+            (:\s*)
+            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
+        ''', fix_kv, code)
+    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
+    return res
+
+
  def qualities(quality_ids):
      """ Get a numeric quality value out of a list of possible values """
      def q(qid):
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index 2ef0d59e37117640da5c4d6558024a2de7625f49..a05ce2ebaafc5f22c27de39191ecaec9d48362f7 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2014.08.10'
+__version__ = '2014.08.24.5'
author	Philipp Hagemeister <phihag@phihag.de>
	Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)
committer	Philipp Hagemeister <phihag@phihag.de>
	Sun, 24 Aug 2014 05:14:23 +0000 (07:14 +0200)
Makefile		patch \| blob \| history
README.md		patch \| blob \| history
test/helper.py		patch \| blob \| history
test/test_YoutubeDL.py		patch \| blob \| history
test/test_download.py		patch \| blob \| history
test/test_playlists.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/downloader/http.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/aparat.py		patch \| blob \| history
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/bliptv.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/dump.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/ebaumsworld.py		patch \| blob \| history
youtube_dl/extractor/ellentv.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/escapist.py		patch \| blob \| history
youtube_dl/extractor/expotv.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/gameone.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/grooveshark.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/jove.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/metacafe.py		patch \| blob \| history
youtube_dl/extractor/ministrygrid.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/mitele.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/movieclips.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/nuvid.py		patch \| blob \| history
youtube_dl/extractor/patreon.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/pbs.py		patch \| blob \| history
youtube_dl/extractor/playfm.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/pornotube.py		patch \| blob \| history
youtube_dl/extractor/rtlnl.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/rtve.py		patch \| blob \| history
youtube_dl/extractor/sbs.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/teamcoco.py		patch \| blob \| history
youtube_dl/extractor/vodlocker.py		patch \| blob \| history
youtube_dl/extractor/wat.py		patch \| blob \| history
youtube_dl/extractor/wayofthemaster.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/xhamster.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history