Merge remote-tracking branch 'jaimeMF/format_selection'
authorPhilipp Hagemeister <phihag@phihag.de>
Thu, 17 Oct 2013 22:17:03 +0000 (00:17 +0200)
committerPhilipp Hagemeister <phihag@phihag.de>
Thu, 17 Oct 2013 22:17:03 +0000 (00:17 +0200)
40 files changed:
Makefile
README.md
test/__init__.py [new file with mode: 0644]
test/helper.py
test/test_age_restriction.py
test/test_all_urls.py
test/test_dailymotion_subtitles.py
test/test_download.py
test/test_playlists.py
test/test_utils.py
test/test_write_annotations.py [new file with mode: 0644]
test/test_write_info_json.py
test/test_youtube_lists.py
test/test_youtube_signature.py
test/test_youtube_subtitles.py
tox.ini
youtube_dl/PostProcessor.py
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/cinemassacre.py [new file with mode: 0644]
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/internetvideoarchive.py [new file with mode: 0644]
youtube_dl/extractor/nowvideo.py [new file with mode: 0644]
youtube_dl/extractor/rottentomatoes.py [new file with mode: 0644]
youtube_dl/extractor/rutube.py [new file with mode: 0644]
youtube_dl/extractor/sztvhu.py [new file with mode: 0644]
youtube_dl/extractor/techtalks.py [new file with mode: 0644]
youtube_dl/extractor/tudou.py
youtube_dl/extractor/videodetective.py [new file with mode: 0644]
youtube_dl/extractor/videopremium.py [new file with mode: 0644]
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/websurg.py [new file with mode: 0644]
youtube_dl/extractor/youku.py
youtube_dl/extractor/youtube.py
youtube_dl/utils.py
youtube_dl/version.py

index 85dacfa4c31f2b83860891d6339b8b4a0e48c6b7..c6d09932bcd4f45b8910e828255703403c2df0d7 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -13,13 +13,13 @@ PYTHON=/usr/bin/env python
 
 # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
 ifeq ($(PREFIX),/usr)
-    SYSCONFDIR=/etc
+       SYSCONFDIR=/etc
 else
-    ifeq ($(PREFIX),/usr/local)
-        SYSCONFDIR=/etc
-    else
-        SYSCONFDIR=$(PREFIX)/etc
-    endif
+       ifeq ($(PREFIX),/usr/local)
+               SYSCONFDIR=/etc
+       else
+               SYSCONFDIR=$(PREFIX)/etc
+       endif
 endif
 
 install: youtube-dl youtube-dl.1 youtube-dl.bash-completion
@@ -71,6 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
                --exclude '*~' \
                --exclude '__pycache' \
                --exclude '.git' \
+               --exclude 'testdata' \
                -- \
                bin devscripts test youtube_dl \
                CHANGELOG LICENSE README.md README.txt \
index 8824daee2cba4437c44db2576384527c55a3c3f1..6dae0a580f282ac053e0af0bcf7b41ff9cb1270a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -57,9 +57,10 @@ which means you can modify it, redistribute it or use it however you like.
                                file. Record all downloaded videos in it.
 
 ## Download Options:
-    -r, --rate-limit LIMIT     maximum download rate (e.g. 50k or 44.6m)
+    -r, --rate-limit LIMIT     maximum download rate in bytes per second (e.g.
+                               50K or 4.2M)
     -R, --retries RETRIES      number of retries (default is 10)
-    --buffer-size SIZE         size of download buffer (e.g. 1024 or 16k)
+    --buffer-size SIZE         size of download buffer (e.g. 1024 or 16K)
                                (default is 1024)
     --no-resize-buffer         do not automatically adjust the buffer size. By
                                default, the buffer size is automatically resized
@@ -100,6 +101,7 @@ which means you can modify it, redistribute it or use it however you like.
                                file modification time
     --write-description        write video description to a .description file
     --write-info-json          write video metadata to a .info.json file
+    --write-annotations        write video annotations to a .annotation file
     --write-thumbnail          write thumbnail image to disk
 
 ## Verbosity / Simulation Options:
@@ -166,6 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
                                processed files are overwritten by default
     --embed-subs               embed subtitles in the video (only for mp4
                                videos)
+    --add-metadata             add metadata to the files
 
 # CONFIGURATION
 
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
index ad1b74dd30c140b01d8c9a4c51513cb2e3b8997f..79a0ede48989a4124d9992bc49e1b5ebbb7e6921 100644 (file)
@@ -1,22 +1,27 @@
 import errno
 import io
+import hashlib
 import json
 import os.path
 import re
 import types
 
 import youtube_dl.extractor
-from youtube_dl import YoutubeDL, YoutubeDLHandler
-from youtube_dl.utils import (
-    compat_cookiejar,
-    compat_urllib_request,
-)
+from youtube_dl import YoutubeDL
 
-youtube_dl._setup_opener(timeout=10)
 
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    parameters = json.load(pf)
+def global_setup():
+    youtube_dl._setup_opener(timeout=10)
+
+
+def get_params(override=None):
+    PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                   "parameters.json")
+    with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
+        parameters = json.load(pf)
+    if override:
+        parameters.update(override)
+    return parameters
 
 
 def try_rm(filename):
@@ -32,7 +37,7 @@ class FakeYDL(YoutubeDL):
     def __init__(self):
         # Different instances of the downloader can't share the same dictionary
         # some test set the "sublang" parameter, which would break the md5 checks.
-        params = dict(parameters)
+        params = get_params()
         super(FakeYDL, self).__init__(params)
         self.result = []
         
@@ -62,3 +67,6 @@ def get_testcases():
         for t in getattr(ie, '_TESTS', []):
             t['name'] = type(ie).__name__[:-len('IE')]
             yield t
+
+
+md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
index 943f9a315f804b6c6b7f8f31c9688ef1e5ee6d41..d500c6edceb6018510b9226d925d9f407b72fcbd 100644 (file)
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import global_setup, try_rm
+global_setup()
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from youtube_dl import YoutubeDL
-from helper import try_rm
 
 
 def _download_restricted(url, filename, age):
index ff1c86efebe31f2d6477cbfe6246baa50607d6a7..56e5f80e1f6ddb17fef3ee5c499c238996c12051 100644 (file)
@@ -1,14 +1,20 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors
-from helper import get_testcases
+from test.helper import get_testcases
+
+from youtube_dl.extractor import (
+    gen_extractors,
+    JustinTVIE,
+    YoutubeIE,
+)
+
 
 class TestAllURLsMatching(unittest.TestCase):
     def setUp(self):
index ed2ad311df0035010be978e3515e88e18f7ba11a..c596415c4189624d254f72afd7b7bd7452d9fe50 100644 (file)
@@ -1,18 +1,16 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
 import sys
 import unittest
-import hashlib
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test.helper import FakeYDL, global_setup, md5
+global_setup()
 
-from youtube_dl.extractor import DailymotionIE
-from youtube_dl.utils import *
-from helper import FakeYDL
 
-md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
+from youtube_dl.extractor import DailymotionIE
 
 class TestDailymotionSubtitles(unittest.TestCase):
     def setUp(self):
index fdf59bb5c6af88bc0ee8dcbcbb4bc72b383b7968..b9a9be11d9686243ed2a1d5b748db4bc04712c54 100644 (file)
@@ -1,26 +1,31 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import get_params, get_testcases, global_setup, try_rm, md5
+global_setup()
+
+
 import hashlib
 import io
-import os
 import json
-import unittest
-import sys
 import socket
-import binascii
-
-# Allow direct execution
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import youtube_dl.YoutubeDL
-from youtube_dl.utils import *
-
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
+from youtube_dl.utils import (
+    compat_str,
+    compat_urllib_error,
+    DownloadError,
+    ExtractorError,
+    UnavailableVideoError,
+)
 
 RETRIES = 3
 
-md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
-
 class YoutubeDL(youtube_dl.YoutubeDL):
     def __init__(self, *args, **kwargs):
         self.to_stderr = self.to_screen
@@ -37,18 +42,12 @@ def _file_md5(fn):
     with open(fn, 'rb') as f:
         return hashlib.md5(f.read()).hexdigest()
 
-import helper  # Set up remaining global configuration
-from helper import get_testcases, try_rm
 defs = get_testcases()
 
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    parameters = json.load(pf)
-
 
 class TestDownload(unittest.TestCase):
     maxDiff = None
     def setUp(self):
-        self.parameters = parameters
         self.defs = defs
 
 ### Dynamically generate tests
@@ -68,8 +67,7 @@ def generator(test_case):
             print_skipping(test_case['skip'])
             return
 
-        params = self.parameters.copy()
-        params.update(test_case.get('params', {}))
+        params = get_params(test_case.get('params', {}))
 
         ydl = YoutubeDL(params)
         ydl.add_default_info_extractors()
index de8bd298a3e6c5c54518869b2bac2c166c5c3226..d6a8d56df99609e50ea5885d2f5a3eb48b72cf37 100644 (file)
@@ -1,13 +1,16 @@
 #!/usr/bin/env python
 # encoding: utf-8
 
-import sys
-import unittest
-import json
 
 # Allow direct execution
 import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL, global_setup
+global_setup()
+
 
 from youtube_dl.extractor import (
     DailymotionPlaylistIE,
@@ -18,9 +21,7 @@ from youtube_dl.extractor import (
     LivestreamIE,
     NHLVideocenterIE,
 )
-from youtube_dl.utils import *
 
-from helper import FakeYDL
 
 class TestPlaylists(unittest.TestCase):
     def assertIsPlaylist(self, info):
index ff2e9885bdba233699edd69457aed91de35c8708..f3fbff042ccc8193d8d08527fdc04421c9832305 100644 (file)
@@ -1,14 +1,15 @@
 #!/usr/bin/env python
+# coding: utf-8
 
-# Various small unit tests
-
+# Allow direct execution
+import os
 import sys
 import unittest
-import xml.etree.ElementTree
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Various small unit tests
+import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
 from youtube_dl.utils import (
@@ -20,6 +21,9 @@ from youtube_dl.utils import (
     unified_strdate,
     find_xpath_attr,
     get_meta_content,
+    xpath_with_ns,
+    smuggle_url,
+    unsmuggle_url,
 )
 
 if sys.version_info < (3, 0):
@@ -141,5 +145,31 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(get_meta('description'), u'foo & bar')
         self.assertEqual(get_meta('author'), 'Plato')
 
+    def test_xpath_with_ns(self):
+        testxml = u'''<root xmlns:media="http://example.com/">
+            <media:song>
+                <media:author>The Author</media:author>
+                <url>http://server.com/download.mp3</url>
+            </media:song>
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+        find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
+        self.assertTrue(find('media:song') is not None)
+        self.assertEqual(find('media:song/media:author').text, u'The Author')
+        self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3')
+
+    def test_smuggle_url(self):
+        data = {u"ö": u"ö", u"abc": [3]}
+        url = 'https://foo.bar/baz?x=y#a'
+        smug_url = smuggle_url(url, data)
+        unsmug_url, unsmug_data = unsmuggle_url(smug_url)
+        self.assertEqual(url, unsmug_url)
+        self.assertEqual(data, unsmug_data)
+
+        res_url, res_data = unsmuggle_url(url)
+        self.assertEqual(res_url, url)
+        self.assertEqual(res_data, None)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py
new file mode 100644 (file)
index 0000000..35defb8
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import get_params, global_setup, try_rm
+global_setup()
+
+
+import io
+
+import xml.etree.ElementTree
+
+import youtube_dl.YoutubeDL
+import youtube_dl.extractor
+
+
+class YoutubeDL(youtube_dl.YoutubeDL):
+    def __init__(self, *args, **kwargs):
+        super(YoutubeDL, self).__init__(*args, **kwargs)
+        self.to_stderr = self.to_screen
+
+params = get_params({
+    'writeannotations': True,
+    'skip_download': True,
+    'writeinfojson': False,
+    'format': 'flv',
+})
+
+
+
+TEST_ID = 'gr51aVj-mLg'
+ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml'
+EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label']
+
+class TestAnnotations(unittest.TestCase):
+    def setUp(self):
+        # Clear old files
+        self.tearDown()
+
+
+    def test_info_json(self):
+        expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text.
+        ie = youtube_dl.extractor.YoutubeIE()
+        ydl = YoutubeDL(params)
+        ydl.add_info_extractor(ie)
+        ydl.download([TEST_ID])
+        self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
+        annoxml = None
+        with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
+                annoxml = xml.etree.ElementTree.parse(annof)
+        self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
+        root = annoxml.getroot()
+        self.assertEqual(root.tag, 'document')
+        annotationsTag = root.find('annotations')
+        self.assertEqual(annotationsTag.tag, 'annotations')
+        annotations = annotationsTag.findall('annotation')
+
+        #Not all the annotations have TEXT children and the annotations are returned unsorted.
+        for a in annotations:
+                self.assertEqual(a.tag, 'annotation')
+                if a.get('type') == 'text':
+                        textTag = a.find('TEXT')
+                        text = textTag.text
+                        self.assertTrue(text in expected) #assertIn only added in python 2.7
+                        #remove the first occurance, there could be more than one annotation with the same text
+                        expected.remove(text)
+        #We should have seen (and removed) all the expected annotation texts.
+        self.assertEqual(len(expected), 0, 'Not all expected annotations were found.')
+        
+
+    def tearDown(self):
+        try_rm(ANNOTATIONS_FILE)
+
+if __name__ == '__main__':
+    unittest.main()
index de6d5180fc0a88a66b747548352fdf72775d7c00..a5b6f6972df48f6b7cdcfebc3ea32d11c6a27afa 100644 (file)
@@ -1,37 +1,34 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-import json
+# Allow direct execution
 import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test.helper import get_params, global_setup
+global_setup()
+
+
+import io
+import json
 
 import youtube_dl.YoutubeDL
 import youtube_dl.extractor
-from youtube_dl.utils import *
-
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
 
-# General configuration (from __init__, not very elegant...)
-jar = compat_cookiejar.CookieJar()
-cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
-proxy_handler = compat_urllib_request.ProxyHandler()
-opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
-compat_urllib_request.install_opener(opener)
 
 class YoutubeDL(youtube_dl.YoutubeDL):
     def __init__(self, *args, **kwargs):
         super(YoutubeDL, self).__init__(*args, **kwargs)
         self.to_stderr = self.to_screen
 
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    params = json.load(pf)
-params['writeinfojson'] = True
-params['skip_download'] = True
-params['writedescription'] = True
+params = get_params({
+    'writeinfojson': True,
+    'skip_download': True,
+    'writedescription': True,
+})
+
 
 TEST_ID = 'BaW_jenozKc'
 INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
@@ -42,6 +39,7 @@ This is a test video for youtube-dl.
 
 For more information, contact phihag@phihag.de .'''
 
+
 class TestInfoJSON(unittest.TestCase):
     def setUp(self):
         # Clear old files
index 53e65816dfb9268c40831a57c781a169fbb050b7..4b7a7847bd3a33a9a2bff3e99f9f4cff0de7eebf 100644 (file)
@@ -1,20 +1,26 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
 import sys
 import unittest
-import json
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL, global_setup
+global_setup()
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE
-from youtube_dl.utils import *
+from youtube_dl.extractor import (
+    YoutubeUserIE,
+    YoutubePlaylistIE,
+    YoutubeIE,
+    YoutubeChannelIE,
+    YoutubeShowIE,
+)
 
-from helper import FakeYDL
 
 class TestYoutubeLists(unittest.TestCase):
-    def assertIsPlaylist(self,info):
+    def assertIsPlaylist(self, info):
         """Make sure the info has '_type' set to 'playlist'"""
         self.assertEqual(info['_type'], 'playlist')
 
@@ -100,7 +106,7 @@ class TestYoutubeLists(unittest.TestCase):
         dl = FakeYDL()
         ie = YoutubeShowIE(dl)
         result = ie.extract('http://www.youtube.com/show/airdisasters')
-        self.assertTrue(len(result) >= 4)
+        self.assertTrue(len(result) >= 3)
 
 if __name__ == '__main__':
     unittest.main()
index 5007d9a16305f055d38231cf2626dedcbd0c70ee..5e1ff5eb0ede5bcb020cd027ca00d5b4159f9812 100644 (file)
@@ -1,14 +1,18 @@
 #!/usr/bin/env python
 
-import io
-import re
-import string
+# Allow direct execution
+import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test.helper import global_setup
+global_setup()
+
+
+import io
+import re
+import string
 
 from youtube_dl.extractor import YoutubeIE
 from youtube_dl.utils import compat_str, compat_urlretrieve
index f9b0c1ad0b532191a6d4f1e97be5d86ad20d37fd..00430a338af7edfcdc7ea5f0380b888e86563ec4 100644 (file)
@@ -1,69 +1,79 @@
 #!/usr/bin/env python
 
+# Allow direct execution
+import os
 import sys
 import unittest
-import hashlib
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL, global_setup, md5
+global_setup()
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from youtube_dl.extractor import YoutubeIE
-from youtube_dl.utils import *
-from helper import FakeYDL
 
-md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
 
 class TestYoutubeSubtitles(unittest.TestCase):
     def setUp(self):
         self.DL = FakeYDL()
         self.url = 'QRS8MkLhQmM'
+
     def getInfoDict(self):
         IE = YoutubeIE(self.DL)
         info_dict = IE.extract(self.url)
         return info_dict
+
     def getSubtitles(self):
         info_dict = self.getInfoDict()
-        return info_dict[0]['subtitles']        
+        return info_dict[0]['subtitles']
+
     def test_youtube_no_writesubtitles(self):
         self.DL.params['writesubtitles'] = False
         subtitles = self.getSubtitles()
         self.assertEqual(subtitles, None)
+
     def test_youtube_subtitles(self):
         self.DL.params['writesubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
+
     def test_youtube_subtitles_lang(self):
         self.DL.params['writesubtitles'] = True
         self.DL.params['subtitleslangs'] = ['it']
         subtitles = self.getSubtitles()
         self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
+
     def test_youtube_allsubtitles(self):
         self.DL.params['writesubtitles'] = True
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(len(subtitles.keys()), 13)
+
     def test_youtube_subtitles_sbv_format(self):
         self.DL.params['writesubtitles'] = True
         self.DL.params['subtitlesformat'] = 'sbv'
         subtitles = self.getSubtitles()
         self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
+
     def test_youtube_subtitles_vtt_format(self):
         self.DL.params['writesubtitles'] = True
         self.DL.params['subtitlesformat'] = 'vtt'
         subtitles = self.getSubtitles()
         self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
+
     def test_youtube_list_subtitles(self):
         self.DL.expect_warning(u'Video doesn\'t have automatic captions')
         self.DL.params['listsubtitles'] = True
         info_dict = self.getInfoDict()
         self.assertEqual(info_dict, None)
+
     def test_youtube_automatic_captions(self):
         self.url = '8YoUxe5ncPo'
         self.DL.params['writeautomaticsub'] = True
         self.DL.params['subtitleslangs'] = ['it']
         subtitles = self.getSubtitles()
         self.assertTrue(subtitles['it'] is not None)
+
     def test_youtube_nosubtitles(self):
         self.DL.expect_warning(u'video doesn\'t have subtitles')
         self.url = 'sAjKT8FhjI8'
@@ -71,6 +81,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(len(subtitles), 0)
+
     def test_youtube_multiple_langs(self):
         self.url = 'QRS8MkLhQmM'
         self.DL.params['writesubtitles'] = True
diff --git a/tox.ini b/tox.ini
index 53b461fdbb5de0b0e12d574e76d9e1215c4d83ee..ed01e3386d8efcaff7bb846ac1f83c3d62763fb0 100644 (file)
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,8 @@
 [tox]
 envlist = py26,py27,py33
 [testenv]
-deps = nose
-commands = nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose test
+deps =
+   nose
+   coverage
+commands = nosetests --verbose {posargs:test}  # --with-coverage --cover-package=youtube_dl --cover-html
+                                               # test.test_download:TestDownload.test_NowVideo
index fbf8a7f98ffc67792c48de2cb21a1536ffba08ac..13b56ede5fdb3d66064a8072cdda87787eee1bae 100644 (file)
@@ -2,9 +2,15 @@ import os
 import subprocess
 import sys
 import time
-import datetime
 
-from .utils import *
+
+from .utils import (
+    compat_subprocess_get_DEVNULL,
+    encodeFilename,
+    PostProcessingError,
+    shell_quote,
+    subtitles_filename,
+)
 
 
 class PostProcessor(object):
@@ -83,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor):
                + opts +
                [encodeFilename(self._ffmpeg_filename_argument(out_path))])
 
+        if self._downloader.params.get('verbose', False):
+            self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         stdout,stderr = p.communicate()
         if p.returncode != 0:
@@ -178,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
             extension = self._preferredcodec
             more_opts = []
             if self._preferredquality is not None:
-                if int(self._preferredquality) < 10:
+                # The opus codec doesn't support the -aq option
+                if int(self._preferredquality) < 10 and extension != 'opus':
                     more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality]
                 else:
                     more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k']
index a32e50772642adc046d6614577b8533584bf3745..f22a8bd0e044b9c10ecad56187a15a310a4c1d7d 100644 (file)
@@ -71,6 +71,7 @@ class YoutubeDL(object):
     logtostderr:       Log messages to stderr instead of stdout.
     writedescription:  Write the video description to a .description file
     writeinfojson:     Write the video description to a .info.json file
+    writeannotations:  Write the video annotations to a .annotations.xml file
     writethumbnail:    Write the thumbnail image to a file
     writesubtitles:    Write the video subtitles to a file
     writeautomaticsub: Write the automatic subtitles to a file
@@ -258,6 +259,10 @@ class YoutubeDL(object):
         """ Report that the metadata file has been written """
         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 
+    def report_writeannotations(self, annofn):
+        """ Report that the annotations file has been written. """
+        self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+
     def report_file_already_downloaded(self, file_name):
         """Report file has already been fully downloaded."""
         try:
@@ -599,6 +604,18 @@ class YoutubeDL(object):
                 self.report_error(u'Cannot write description file ' + descfn)
                 return
 
+        if self.params.get('writeannotations', False):
+            try:
+               annofn = filename + u'.annotations.xml'
+               self.report_writeannotations(annofn)
+               with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                   annofile.write(info_dict['annotations'])
+            except (KeyError, TypeError):
+                self.report_warning(u'There are no annotations to write.')
+            except (OSError, IOError):
+                 self.report_error(u'Cannot write annotations file: ' + annofn)
+                 return
+
         subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                        self.params.get('writeautomaticsub')])
 
index bc8e97250e18a9f52ed48a2cafe3ee48348cef05..cd642ce3b5a58aca93e4fb6cc7c65d3a540cdc35 100644 (file)
@@ -31,6 +31,7 @@ __authors__  = (
     'Huarong Huo',
     'Ismael Mejía',
     'Steffan \'Ruirize\' James',
+    'Andras Elso',
 )
 
 __license__ = 'Public Domain'
@@ -46,17 +47,43 @@ import shlex
 import socket
 import subprocess
 import sys
-import warnings
+import traceback
 import platform
 
 
-from .utils import *
+from .utils import (
+    compat_cookiejar,
+    compat_print,
+    compat_str,
+    compat_urllib_request,
+    DateRange,
+    decodeOption,
+    determine_ext,
+    DownloadError,
+    get_cachedir,
+    make_HTTPS_handler,
+    MaxDownloadsReached,
+    platform_name,
+    preferredencoding,
+    SameFileError,
+    std_headers,
+    write_string,
+    YoutubeDLHandler,
+)
 from .update import update_self
 from .version import __version__
-from .FileDownloader import *
+from .FileDownloader import (
+    FileDownloader,
+)
 from .extractor import gen_extractors
 from .YoutubeDL import YoutubeDL
-from .PostProcessor import *
+from .PostProcessor import (
+    FFmpegMetadataPP,
+    FFmpegVideoConvertor,
+    FFmpegExtractAudioPP,
+    FFmpegEmbedSubtitlePP,
+)
+
 
 def parseOpts(overrideArguments=None):
     def _readOptions(filename_bytes):
@@ -240,11 +267,11 @@ def parseOpts(overrideArguments=None):
             help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')
 
     downloader.add_option('-r', '--rate-limit',
-            dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)')
+            dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')
     downloader.add_option('-R', '--retries',
             dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)
     downloader.add_option('--buffer-size',
-            dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024")
+            dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")
     downloader.add_option('--no-resize-buffer',
             action='store_true', dest='noresizebuffer',
             help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False)
@@ -339,6 +366,9 @@ def parseOpts(overrideArguments=None):
     filesystem.add_option('--write-info-json',
             action='store_true', dest='writeinfojson',
             help='write video metadata to a .info.json file', default=False)
+    filesystem.add_option('--write-annotations',
+            action='store_true', dest='writeannotations',
+            help='write video annotations to a .annotation file', default=False)
     filesystem.add_option('--write-thumbnail',
             action='store_true', dest='writethumbnail',
             help='write thumbnail image to disk', default=False)
@@ -601,6 +631,7 @@ def _real_main(argv=None):
         'nopart': opts.nopart,
         'updatetime': opts.updatetime,
         'writedescription': opts.writedescription,
+        'writeannotations': opts.writeannotations,
         'writeinfojson': opts.writeinfojson,
         'writethumbnail': opts.writethumbnail,
         'writesubtitles': opts.writesubtitles,
@@ -684,7 +715,7 @@ def _real_main(argv=None):
     if opts.cookiefile is not None:
         try:
             jar.save()
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
             sys.exit(u'ERROR: unable to save cookie jar')
 
     sys.exit(retcode)
index 688196869543628f8c08ecfab63725f147d30592..db69af361929fd7ff726d1a1df980730cad3630c 100644 (file)
@@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE
 from .addanime import AddAnimeIE
 from .archiveorg import ArchiveOrgIE
 from .ard import ARDIE
-from .arte import ArteTvIE
+from .arte import (
+    ArteTvIE,
+    ArteTVPlus7IE,
+    ArteTVCreativeIE,
+    ArteTVFutureIE,
+)
 from .auengine import AUEngineIE
 from .bandcamp import BandcampIE
 from .bliptv import BlipTVIE, BlipTVUserIE
@@ -12,6 +17,7 @@ from .brightcove import BrightcoveIE
 from .c56 import C56IE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
+from .cinemassacre import CinemassacreIE
 from .cnn import CNNIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
@@ -61,6 +67,7 @@ from .ign import IGNIE, OneUPIE
 from .ina import InaIE
 from .infoq import InfoQIE
 from .instagram import InstagramIE
+from .internetvideoarchive import InternetVideoArchiveIE
 from .jeuxvideo import JeuxVideoIE
 from .jukebox import JukeboxIE
 from .justintv import JustinTVIE
@@ -82,6 +89,7 @@ from .nba import NBAIE
 from .nbc import NBCNewsIE
 from .newgrounds import NewgroundsIE
 from .nhl import NHLIE, NHLVideocenterIE
+from .nowvideo import NowVideoIE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
 from .pbs import PBSIE
@@ -91,8 +99,10 @@ from .rbmaradio import RBMARadioIE
 from .redtube import RedTubeIE
 from .ringtv import RingTVIE
 from .ro220 import Ro220IE
+from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
+from .rutube import RutubeIE
 from .sina import SinaIE
 from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE
@@ -103,7 +113,9 @@ from .spiegel import SpiegelIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .statigram import StatigramIE
 from .steam import SteamIE
+from .sztvhu import SztvHuIE
 from .teamcoco import TeamcocoIE
+from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tf1 import TF1IE
 from .thisav import ThisAVIE
@@ -120,10 +132,13 @@ from .veoh import VeohIE
 from .vevo import VevoIE
 from .vice import ViceIE
 from .viddler import ViddlerIE
+from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
+from .videopremium import VideoPremiumIE
 from .vimeo import VimeoIE, VimeoChannelIE
 from .vine import VineIE
 from .wat import WatIE
+from .websurg import WeBSurgIE
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .worldstarhiphop import WorldStarHipHopIE
index 4707d7ccab51502dadf787ab2a2fb1558a1c9d45..5ee8a67b14699a330914cd4f0e0f627ca9fca5a5 100644 (file)
@@ -1,3 +1,4 @@
+# encoding: utf-8
 import re
 import json
 import xml.etree.ElementTree
@@ -7,15 +8,15 @@ from ..utils import (
     ExtractorError,
     find_xpath_attr,
     unified_strdate,
+    determine_ext,
+    get_element_by_id,
 )
 
+# There are different sources of video in arte.tv, the extraction process
+# is different for each one. The videos usually expire in 7 days, so we can't
+# add tests.
+
 class ArteTvIE(InfoExtractor):
-    """
-    There are two sources of video in arte.tv: videos.arte.tv and
-    www.arte.tv/guide, the extraction process is different for each one.
-    The videos expire in 7 days, so we can't add tests.
-    """
-    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
     _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
     _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
     _LIVE_URL = r'index-[0-9]+\.html$'
@@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor):
 
     @classmethod
     def suitable(cls, url):
-        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL))
+        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
 
     # TODO implement Live Stream
     # from ..utils import compat_urllib_parse
@@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor):
     #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))
 
     def _real_extract(self, url):
-        mobj = re.match(self._EMISSION_URL, url)
-        if mobj is not None:
-            lang = mobj.group('lang')
-            # This is not a real id, it can be for example AJT for the news
-            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
-            video_id = mobj.group('id')
-            return self._extract_emission(url, video_id, lang)
-
         mobj = re.match(self._VIDEOS_URL, url)
         if mobj is not None:
             id = mobj.group('id')
@@ -80,59 +73,6 @@ class ArteTvIE(InfoExtractor):
             # self.extractLiveStream(url)
             # return
 
-    def _extract_emission(self, url, video_id, lang):
-        """Extract from www.arte.tv/guide"""
-        webpage = self._download_webpage(url, video_id)
-        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
-
-        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
-        self.report_extraction(video_id)
-        info = json.loads(json_info)
-        player_info = info['videoJsonPlayer']
-
-        info_dict = {'id': player_info['VID'],
-                     'title': player_info['VTI'],
-                     'description': player_info.get('VDE'),
-                     'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
-                     'thumbnail': player_info['programImage'],
-                     'ext': 'flv',
-                     }
-
-        formats = player_info['VSR'].values()
-        def _match_lang(f):
-            # Return true if that format is in the language of the url
-            if lang == 'fr':
-                l = 'F'
-            elif lang == 'de':
-                l = 'A'
-            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
-            return any(re.match(r, f['versionCode']) for r in regexes)
-        # Some formats may not be in the same language as the url
-        formats = filter(_match_lang, formats)
-        # Some formats use the m3u8 protocol
-        formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats)
-        # We order the formats by quality
-        formats = sorted(formats, key=lambda f: int(f['height']))
-        # Prefer videos without subtitles in the same language
-        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None)
-        # Pick the best quality
-        def _format(format_info):
-            info = {'ext': 'flv',
-                    'width': format_info.get('width'),
-                    'height': format_info.get('height'),
-                    }
-            if format_info['mediaType'] == u'rtmp':
-                info['url'] = format_info['streamer']
-                info['play_path'] = 'mp4:' + format_info['url']
-            else:
-                info_dict['url'] = format_info['url']
-            return info
-        info_dict['formats'] = [_format(f) for f in formats]
-        # TODO: Remove when #980 has been merged 
-        info_dict.update(info_dict['formats'][-1])
-
-        return info_dict
-
     def _extract_video(self, url, video_id, lang):
         """Extract from videos.arte.tv"""
         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
@@ -182,3 +122,110 @@ class ArteTvIE(InfoExtractor):
                 'ext': 'flv',
                 'thumbnail': self._og_search_thumbnail(webpage),
                 }
+
+
+class ArteTVPlus7IE(InfoExtractor):
+    IE_NAME = u'arte.tv:+7'
+    _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+
+    @classmethod
+    def _extract_url_info(cls, url):
+        mobj = re.match(cls._VALID_URL, url)
+        lang = mobj.group('lang')
+        # This is not a real id, it can be for example AJT for the news
+        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
+        video_id = mobj.group('id')
+        return video_id, lang
+
+    def _real_extract(self, url):
+        video_id, lang = self._extract_url_info(url)
+        webpage = self._download_webpage(url, video_id)
+        return self._extract_from_webpage(webpage, video_id, lang)
+
+    def _extract_from_webpage(self, webpage, video_id, lang):
+        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+
+        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
+        self.report_extraction(video_id)
+        info = json.loads(json_info)
+        player_info = info['videoJsonPlayer']
+
+        info_dict = {
+            'id': player_info['VID'],
+            'title': player_info['VTI'],
+            'description': player_info.get('VDE'),
+            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
+            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+        }
+
+        formats = player_info['VSR'].values()
+        def _match_lang(f):
+            if f.get('versionCode') is None:
+                return True
+            # Return true if that format is in the language of the url
+            if lang == 'fr':
+                l = 'F'
+            elif lang == 'de':
+                l = 'A'
+            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
+            return any(re.match(r, f['versionCode']) for r in regexes)
+        # Some formats may not be in the same language as the url
+        formats = filter(_match_lang, formats)
+        # Some formats use the m3u8 protocol
+        formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
+        # We order the formats by quality
+        formats = sorted(formats, key=lambda f: int(f.get('height',-1)))
+        # Prefer videos without subtitles in the same language
+        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None)
+        # Pick the best quality
+        def _format(format_info):
+            info = {
+                'width': format_info.get('width'),
+                'height': format_info.get('height'),
+            }
+            if format_info['mediaType'] == u'rtmp':
+                info['url'] = format_info['streamer']
+                info['play_path'] = 'mp4:' + format_info['url']
+                info['ext'] = 'flv'
+            else:
+                info['url'] = format_info['url']
+                info['ext'] = determine_ext(info['url'])
+            return info
+        info_dict['formats'] = [_format(f) for f in formats]
+        # TODO: Remove when #980 has been merged 
+        info_dict.update(info_dict['formats'][-1])
+
+        return info_dict
+
+
+# It also uses the arte_vp_url url from the webpage to extract the information
+class ArteTVCreativeIE(ArteTVPlus7IE):
+    IE_NAME = u'arte.tv:creative'
+    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)'
+
+    _TEST = {
+        u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
+        u'file': u'050489-002.mp4',
+        u'info_dict': {
+            u'title': u'Agentur Amateur #2 - Corporate Design',
+        },
+    }
+
+
+class ArteTVFutureIE(ArteTVPlus7IE):
+    IE_NAME = u'arte.tv:future'
+    _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
+        u'file': u'050940-003.mp4',
+        u'info_dict': {
+            u'title': u'Les champignons au secours de la planète',
+        },
+    }
+
+    def _real_extract(self, url):
+        anchor_id, lang = self._extract_url_info(url)
+        webpage = self._download_webpage(url, anchor_id)
+        row = get_element_by_id(anchor_id, webpage)
+        return self._extract_from_webpage(row, anchor_id, lang)
index 745212f2fe731bf305e56e8087089c65efabbd68..1392f382a24c273604f0c67db7afafefbcec85b8 100644 (file)
@@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor):
         # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553
         object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
                             lambda m: m.group(1) + '/>', object_str)
+        # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
+        object_str = object_str.replace(u'<--', u'<!--')
 
         object_doc = xml.etree.ElementTree.fromstring(object_str)
         assert u'BrightcoveExperience' in object_doc.attrib['class']
@@ -96,7 +98,10 @@ class BrightcoveIE(InfoExtractor):
         playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
                                                player_key, u'Downloading playlist information')
 
-        playlist_info = json.loads(playlist_info)['videoList']
+        json_data = json.loads(playlist_info)
+        if 'videoList' not in json_data:
+            raise ExtractorError(u'Empty playlist')
+        playlist_info = json_data['videoList']
         videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
 
         return self.playlist_result(videos, playlist_id=playlist_info['id'],
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
new file mode 100644 (file)
index 0000000..6925b96
--- /dev/null
@@ -0,0 +1,91 @@
+# encoding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+
+
+class CinemassacreIE(InfoExtractor):
+    _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
+    _TESTS = [{
+        u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+        u'file': u'19911.flv',
+        u'info_dict': {
+            u'upload_date': u'20121110',
+            u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
+            u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    },
+    {
+        u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+        u'file': u'521be8ef82b16.flv',
+        u'info_dict': {
+            u'upload_date': u'20131002',
+            u'title': u'The Mummy’s Hand (1940)',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        webpage_url = u'http://' + mobj.group('url')
+        webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+        video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
+        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+        if not mobj:
+            raise ExtractorError(u'Can\'t extract embed url and video id')
+        playerdata_url = mobj.group(u'embed_url')
+        video_id = mobj.group(u'video_id')
+
+        video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
+            webpage, u'title')
+        video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
+            webpage, u'description', flags=re.DOTALL, fatal=False)
+        if not video_description:
+            video_description = None
+
+        playerdata = self._download_webpage(playerdata_url, video_id)
+        base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'',
+            playerdata, u'base_url')
+        base_url += '/Cinemassacre/'
+        # Important: The file names in playerdata are not used by the player and even wrong for some videos
+        sd_file = 'Cinemassacre-%s_high.mp4' % video_id
+        hd_file = 'Cinemassacre-%s.mp4' % video_id
+        video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id
+
+        formats = [
+            {
+                'url': base_url + sd_file,
+                'ext': 'flv',
+                'format': 'sd',
+                'format_id': 'sd',
+            },
+            {
+                'url': base_url + hd_file,
+                'ext': 'flv',
+                'format': 'hd',
+                'format_id': 'hd',
+            },
+        ]
+
+        info = {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'description': video_description,
+            'upload_date': video_date,
+            'thumbnail': video_thumbnail,
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
index 5edbf678ad805f4f5408bf7d478fd5f7402ae5ce..098768361ede01d8acc01dc773a31b5b8fc67241 100644 (file)
@@ -1,56 +1,59 @@
 import re
-import xml.etree.ElementTree
+import json
 
 from .common import InfoExtractor
 from ..utils import (
-    unified_strdate,
     compat_urllib_parse,
+    compat_urlparse,
+    unescapeHTML,
+    get_meta_content,
 )
 
+
 class GameSpotIE(InfoExtractor):
-    _WORKING = False
     _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
     _TEST = {
         u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
-        u"file": u"6410818.mp4",
+        u"file": u"gs-2300-6410818.mp4",
         u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
         u"info_dict": {
             u"title": u"Arma 3 - Community Guide: SITREP I",
-            u"upload_date": u"20130627", 
+            u'description': u'Check out this video where some of the basics of Arma 3 is explained.',
         }
     }
 
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        page_id = mobj.group('page_id')
+        page_id = video_id = mobj.group('page_id')
         webpage = self._download_webpage(url, page_id)
-        video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
-                                            r'http://www\.gamespot\.com/videoembed/(\d+)'],
-                                           webpage, 'video id')
-        data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
-        info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
-        info_xml = self._download_webpage(info_url, video_id)
-        doc = xml.etree.ElementTree.fromstring(info_xml)
-        clip_el = doc.find('./playList/clip')
+        data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
+        data_video = json.loads(unescapeHTML(data_video_json))
 
-        http_urls = [{'url': node.find('filePath').text,
-                      'rate': int(node.find('rate').text)}
-            for node in clip_el.find('./httpURI')]
-        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
-        video_url = best_quality['url']
-        title = clip_el.find('./title').text
-        ext = video_url.rpartition('.')[2]
-        thumbnail_url = clip_el.find('./screenGrabURI').text
-        view_count = int(clip_el.find('./views').text)
-        upload_date = unified_strdate(clip_el.find('./postDate').text)
+        # Transform the manifest url to a link to the mp4 files
+        # they are used in mobile devices.
+        f4m_url = data_video['videoStreams']['f4m_stream']
+        f4m_path = compat_urlparse.urlparse(f4m_url).path
+        QUALITIES_RE = r'((,\d+)+,?)'
+        qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',')
+        http_path = f4m_path[1:].split('/', 1)[1]
+        http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+        http_template = http_template.replace('.csmil/manifest.f4m', '')
+        http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+        formats = []
+        for q in qualities:
+            formats.append({
+                'url': http_template % q,
+                'ext': 'mp4',
+                'format_id': q,
+            })
 
-        return [{
-            'id'          : video_id,
-            'url'         : video_url,
-            'ext'         : ext,
-            'title'       : title,
-            'thumbnail'   : thumbnail_url,
-            'upload_date' : upload_date,
-            'view_count'  : view_count,
-        }]
+        info = {
+            'id': data_video['guid'],
+            'title': compat_urllib_parse.unquote(data_video['title']),
+            'formats': formats,
+            'description': get_meta_content('description', webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
index 7060c6f9258c28c9dcb18681c62882f52715edf9..89805250cf690c32f1c9ec8d12a00052c26cd8a3 100644 (file)
@@ -11,6 +11,8 @@ from ..utils import (
     compat_urlparse,
 
     ExtractorError,
+    smuggle_url,
+    unescapeHTML,
 )
 from .brightcove import BrightcoveIE
 
@@ -29,6 +31,17 @@ class GenericIE(InfoExtractor):
                 u"title": u"R\u00e9gis plante sa Jeep"
             }
         },
+        # embedded vimeo video
+        {
+            u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
+            u'file': u'22444065.mp4',
+            u'md5': u'2903896e23df39722c33f015af0666e2',
+            u'info_dict': {
+                u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
+                u"uploader_id": u"skillsmatter",
+                u"uploader": u"Skills Matter",
+            }
+        }
     ]
 
     def report_download_webpage(self, video_id):
@@ -121,12 +134,20 @@ class GenericIE(InfoExtractor):
 
         self.report_extraction(video_id)
         # Look for BrightCove:
-        m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
+        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
         if m_brightcove is not None:
             self.to_screen(u'Brightcove video detected.')
             bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
             return self.url_result(bc_url, 'Brightcove')
 
+        # Look for embedded Vimeo player
+        mobj = re.search(
+            r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage)
+        if mobj:
+            player_url = unescapeHTML(mobj.group(1))
+            surl = smuggle_url(player_url, {'Referer': url})
+            return self.url_result(surl, 'Vimeo')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
new file mode 100644 (file)
index 0000000..5986459
--- /dev/null
@@ -0,0 +1,87 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urlparse,
+    compat_urllib_parse,
+    xpath_with_ns,
+    determine_ext,
+)
+
+
+class InternetVideoArchiveIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
+
+    _TEST = {
+        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+        u'file': u'452693.mp4',
+        u'info_dict': {
+            u'title': u'SKYFALL',
+            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
+            u'duration': 156,
+        },
+    }
+
+    @staticmethod
+    def _build_url(query):
+        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
+
+    @staticmethod
+    def _clean_query(query):
+        NEEDED_ARGS = ['publishedid', 'customerid']
+        query_dic = compat_urlparse.parse_qs(query)
+        cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
+        # Other player ids return m3u8 urls
+        cleaned_dic['playerid'] = '247'
+        cleaned_dic['videokbrate'] = '100000'
+        return compat_urllib_parse.urlencode(cleaned_dic)
+
+    def _real_extract(self, url):
+        query = compat_urlparse.urlparse(url).query
+        query_dic = compat_urlparse.parse_qs(query)
+        video_id = query_dic['publishedid'][0]
+        url = self._build_url(query)
+
+        flashconfiguration_xml = self._download_webpage(url, video_id,
+            u'Downloading flash configuration')
+        flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
+        file_url = flashconfiguration.find('file').text
+        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
+        # Replace some of the parameters in the query to get the best quality
+        # and http links (no m3u8 manifests)
+        file_url = re.sub(r'(?<=\?)(.+)$',
+            lambda m: self._clean_query(m.group()),
+            file_url)
+        info_xml = self._download_webpage(file_url, video_id,
+            u'Downloading video info')
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        item = info.find('channel/item')
+
+        def _bp(p):
+            return xpath_with_ns(p,
+                {'media': 'http://search.yahoo.com/mrss/',
+                'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
+        formats = []
+        for content in item.findall(_bp('media:group/media:content')):
+            attr = content.attrib
+            f_url = attr['url']
+            formats.append({
+                'url': f_url,
+                'ext': determine_ext(f_url),
+                'width': int(attr['width']),
+                'bitrate': int(attr['bitrate']),
+            })
+        formats = sorted(formats, key=lambda f: f['bitrate'])
+
+        info = {
+            'id': video_id,
+            'title': item.find('title').text,
+            'formats': formats,
+            'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
+            'description': item.find('description').text,
+            'duration': int(attr['duration']),
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
new file mode 100644 (file)
index 0000000..ab52ad4
--- /dev/null
@@ -0,0 +1,43 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urlparse
+
+
+class NowVideoIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)'
+    _TEST = {
+        u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+        u'file': u'0mw0yow7b6dxa.flv',
+        u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
+        u'info_dict': {
+            u"title": u"youtubedl test video _BaW_jenozKc.mp4"
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage_url = 'http://www.nowvideo.ch/video/' + video_id
+        webpage = self._download_webpage(webpage_url, video_id)
+
+        self.report_extraction(video_id)
+
+        video_title = self._html_search_regex(r'<h4>(.*)</h4>',
+            webpage, u'video title')
+
+        video_key = self._search_regex(r'var fkzd="(.*)";',
+            webpage, u'video key')
+
+        api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
+        api_response = self._download_webpage(api_call, video_id,
+            u'Downloading API page')
+        video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
+
+        return [{
+            'id':        video_id,
+            'url':       video_url,
+            'ext':       'flv',
+            'title':     video_title,
+        }]
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
new file mode 100644 (file)
index 0000000..c79c394
--- /dev/null
@@ -0,0 +1,16 @@
+from .videodetective import VideoDetectiveIE
+
+
+# It just uses the same method as videodetective.com,
+# the internetvideoarchive.com is extracted from the og:video property
+class RottenTomatoesIE(VideoDetectiveIE):
+    _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
+        u'file': '613340.mp4',
+        u'info_dict': {
+            u'title': u'TOY STORY 3',
+            u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+        },
+    }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
new file mode 100644 (file)
index 0000000..a18034f
--- /dev/null
@@ -0,0 +1,58 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urlparse,
+    compat_str,
+    ExtractorError,
+)
+
+
+class RutubeIE(InfoExtractor):
+    _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
+
+    _TEST = {
+        u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+        u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4',
+        u'info_dict': {
+            u'title': u'Раненный кенгуру забежал в аптеку',
+            u'uploader': u'NTDRussian',
+            u'uploader_id': u'29790',
+        },
+        u'params': {
+            # It requires ffmpeg (m3u8 download)
+            u'skip_download': True,
+        },
+    }
+
+    def _get_api_response(self, short_id, subpath):
+        api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id)
+        response_json = self._download_webpage(api_url, short_id,
+            u'Downloading %s json' % subpath)
+        return json.loads(response_json)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        long_id = mobj.group('long_id')
+        webpage = self._download_webpage(url, long_id)
+        og_video = self._og_search_video_url(webpage)
+        short_id = compat_urlparse.urlparse(og_video).path[1:]
+        options = self._get_api_response(short_id, 'options')
+        trackinfo = self._get_api_response(short_id, 'trackinfo')
+        # Some videos don't have the author field
+        author = trackinfo.get('author') or {}
+        m3u8_url = trackinfo['video_balancer'].get('m3u8')
+        if m3u8_url is None:
+            raise ExtractorError(u'Couldn\'t find m3u8 manifest url')
+
+        return {
+            'id': trackinfo['id'],
+            'title': trackinfo['title'],
+            'url': m3u8_url,
+            'ext': 'mp4',
+            'thumbnail': options['thumbnail_url'],
+            'uploader': author.get('name'),
+            'uploader_id': compat_str(author['id']) if author else None,
+        }
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
new file mode 100644 (file)
index 0000000..81fa35c
--- /dev/null
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class SztvHuIE(InfoExtractor):
+    """Extractor for sztv.hu / tvszombathely.hu (Szombathelyi Televízió).
+
+    The numeric video id is the trailing part of the article slug.  The
+    media file name is scraped from the player configuration embedded in
+    the page and served from http://media.sztv.hu/vod/.
+    """
+    _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+    _TEST = {
+        u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+        u'file': u'20130909.mp4',
+        u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
+        u'info_dict': {
+            u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
+            u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        # NOTE(review): the three unescaped dots match ANY three characters
+        # before the colon — presumably a fixed scheme-like prefix in the
+        # player config; confirm against the page markup and escape if so.
+        video_file = self._search_regex(
+            r'file: "...:(.*?)",', webpage, 'video file')
+        # The <meta name="title"> value looks like "<title> - <x> - <y>";
+        # capture only the leading title part.
+        title = self._html_search_regex(
+            r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"',
+            webpage, 'video title')
+        description = self._html_search_regex(
+            r'<meta name="description" content="([^"]*)"/>',
+            webpage, 'video description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        video_url = 'http://media.sztv.hu/vod/' + video_file
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': determine_ext(video_url),
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py
new file mode 100644 (file)
index 0000000..a55f236
--- /dev/null
@@ -0,0 +1,65 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_attribute,
+    clean_html,
+)
+
+
+class TechTalksIE(InfoExtractor):
+    """Extractor for techtalks.tv.
+
+    Each talk is served over RTMP.  When the page also links a slides
+    video, the extractor returns a two-entry list (presenter video plus
+    slides video) sharing the same RTMP server URL but different play
+    paths; otherwise it returns the single presenter entry.
+    """
+    _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
+
+    _TEST = {
+        u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
+        u'playlist': [
+            {
+                u'file': u'57758.flv',
+                u'info_dict': {
+                    u'title': u'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+            {
+                u'file': u'57758-slides.flv',
+                u'info_dict': {
+                    u'title': u'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+        ],
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        talk_id = mobj.group('id')
+        webpage = self._download_webpage(url, talk_id)
+        rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage,
+            u'rtmp url')
+        play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
+            webpage, u'presenter play path')
+        title = clean_html(get_element_by_attribute('class', 'title', webpage))
+        video_info = {
+                'id': talk_id,
+                'title': title,
+                'url': rtmp_url,
+                'play_path': play_path,
+                'ext': 'flv',
+            }
+        # The slides link is optional; without it only the presenter video exists.
+        m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
+        if m_slides is None:
+            return video_info
+        else:
+            return [
+                video_info,
+                # The slides video
+                {
+                    'id': talk_id + '-slides',
+                    'title': title,
+                    'url': rtmp_url,
+                    'play_path': m_slides.group(1),
+                    'ext': 'flv',
+                },
+            ]
index 1405b73f76ad5166d45d9a9eb9687c49fa8a0bde..79679a14a5ee1703290121e33525411e917c0fb2 100644 (file)
@@ -7,15 +7,25 @@ from .common import InfoExtractor
 
 
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
-    _TEST = {
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+    _TESTS = [{
         u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
         u'file': u'159448201.f4v',
         u'md5': u'140a49ed444bd22f93330985d8475fcb',
         u'info_dict': {
             u"title": u"卡马乔国足开大脚长传冲吊集锦"
         }
-    }
+    },
+    {
+        u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
+        u'file': u'todo.mp4',
+        u'md5': u'todo.mp4',
+        u'info_dict': {
+            u'title': u'todo.mp4',
+        },
+        u'add_ie': [u'Youku'],
+        u'skip': u'Only works from China'
+    }]
 
     def _url_for_id(self, id, quality = None):
         info_url = "http://v2.tudou.com/f?id="+str(id)
@@ -29,14 +39,18 @@ class TudouIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(2)
         webpage = self._download_webpage(url, video_id)
-        title = re.search(",kw:\"(.+)\"",webpage)
-        if title is None:
-            title = re.search(",kw: \'(.+)\'",webpage)
-        title = title.group(1)
-        thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
-        if thumbnail_url is None:
-            thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
-        thumbnail_url = thumbnail_url.group(1)
+
+        m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
+        if m and m.group(1):
+            return {
+                '_type': 'url',
+                'url': u'youku:' + m.group(1),
+                'ie_key': 'Youku'
+            }
+
+        title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title')
+        thumbnail_url = self._search_regex(
+            r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False)
 
         segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
         segments = json.loads(segs_json)
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
new file mode 100644 (file)
index 0000000..d89f840
--- /dev/null
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+from ..utils import (
+    compat_urlparse,
+)
+
+
+class VideoDetectiveIE(InfoExtractor):
+    """Extractor for videodetective.com.
+
+    Thin wrapper: the page's og:video URL carries an Internet Video
+    Archive query string, which is handed off to InternetVideoArchiveIE
+    for the actual extraction.
+    """
+    _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
+        u'file': u'194487.mp4',
+        u'info_dict': {
+            u'title': u'KICK-ASS 2',
+            u'description': u'md5:65ba37ad619165afac7d432eaded6013',
+            u'duration': 138,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        og_video = self._og_search_video_url(webpage)
+        # Delegate to InternetVideoArchiveIE using the query string of the og:video URL.
+        query = compat_urlparse.urlparse(og_video).query
+        return self.url_result(InternetVideoArchiveIE._build_url(query),
+            ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
new file mode 100644 (file)
index 0000000..65f39b9
--- /dev/null
@@ -0,0 +1,40 @@
+import re
+import random
+
+from .common import InfoExtractor
+
+
+class VideoPremiumIE(InfoExtractor):
+    """Extractor for videopremium.tv.
+
+    The media is streamed over RTMP; the play path and player URL are
+    derived from the video id, and the RTMP host is picked at random
+    from the e1..e16 mirror pool.
+    """
+    _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+    _TEST = {
+        u'url': u'http://videopremium.tv/4w7oadjsf156',
+        u'file': u'4w7oadjsf156.f4v',
+        u'info_dict': {
+            u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4"
+        },
+        u'params': {
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage_url = 'http://videopremium.tv/' + video_id
+        webpage = self._download_webpage(webpage_url, video_id)
+
+        self.report_extraction(video_id)
+
+        video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
+            webpage, u'video title')
+
+        # Any of the 16 e<N>.md.iplay.md servers serves the same stream.
+        return [{
+            'id':          video_id,
+            'url':         "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
+            'play_path':   "mp4:%s.f4v" % video_id,
+            'page_url':    "http://videopremium.tv/" + video_id,
+            'player_url':  "http://videopremium.tv/uplayer/uppod.swf",
+            'ext':         'f4v',
+            'title':       video_title,
+        }]
index cea29f03525af91d1be56c475da0f62ce45eea83..2de56ac814462e3c3536ccac34b980b3e9a8bfb5 100644 (file)
@@ -11,6 +11,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     std_headers,
+    unsmuggle_url,
 )
 
 class VimeoIE(InfoExtractor):
@@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor):
                 u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
                 u'uploader': u'The BLN & Business of Software',
             },
-        },
+        }
     ]
 
     def _login(self):
@@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor):
         self._login()
 
     def _real_extract(self, url, new_video=True):
+        url, data = unsmuggle_url(url)
+        headers = std_headers
+        if data is not None:
+            headers = headers.copy()
+            headers.update(data)
+
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor):
             url = 'https://vimeo.com/' + video_id
 
         # Retrieve video webpage to extract further information
-        request = compat_urllib_request.Request(url, None, std_headers)
+        request = compat_urllib_request.Request(url, None, headers)
         webpage = self._download_webpage(request, video_id)
 
         # Now we begin extracting as much information as we can from what we
diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py
new file mode 100644 (file)
index 0000000..43953bf
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+import re
+
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse
+)
+
+from .common import InfoExtractor
+
+class WeBSurgIE(InfoExtractor):
+    """Extractor for websurg.com (surgical training videos).
+
+    Access requires an account: _real_initialize() posts the credentials
+    to the site's AJAX login endpoint before any extraction happens.
+    """
+    IE_NAME = u'websurg.com'
+    _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)'
+
+    _TEST = {
+        u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012',
+        u'file': u'vd01en4012.mp4',
+        u'params': {
+            u'skip_download': True,
+        },
+        u'skip': u'Requires login information',
+    }
+    
+    _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1'
+
+    def _real_initialize(self):
+
+        # NOTE(review): indexing params directly raises KeyError when no
+        # --username/--password was supplied — a clearer error message
+        # (or a check) would help; confirm intended behaviour.
+        login_form = {
+            'username': self._downloader.params['username'],
+            'password': self._downloader.params['password'],
+            'Submit': 1
+        }
+        
+        # First request performs the login POST (response body discarded);
+        # the follow-up GET presumably returns 'OK' once the session is
+        # authenticated — confirm against the site's AJAX contract.
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        request.add_header(
+            'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8')
+        compat_urllib_request.urlopen(request).info()
+        webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in')
+        
+        if webpage != 'OK':
+            self._downloader.report_error(
+                u'Unable to log in: bad username/password')
+        
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group(1)
+        
+        webpage = self._download_webpage(url, video_id)
+        
+        # NOTE(review): url_info is not checked for None — a page without
+        # the streamer/src attributes would raise AttributeError below.
+        url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage)
+        
+        return {'id': video_id,
+                'title': self._og_search_title(webpage),
+                'description': self._og_search_description(webpage),
+                'ext' : 'mp4',
+                'url' : url_info.group(1) + '/' + url_info.group(2),
+                'thumbnail': self._og_search_thumbnail(webpage)
+                }
index 00fa2ccb5469329685a00ada37194219c31cb79f..9d88c17f52a25091ea045d2ea0dd6f819da93473 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class YoukuIE(InfoExtractor):
-    _VALID_URL =  r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
+    _VALID_URL =  r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'
     _TEST =   {
         u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
         u"file": u"XNDgyMDQ2NTQw_part00.flv",
index 8222a880f55f7a27afe94e2aad5db570342650d9..fb7c42830781bf4cbf3f3ba78547e6defe5b13c5 100644 (file)
@@ -1150,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             list_page = self._download_webpage(list_url, video_id)
             caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
             original_lang_node = caption_list.find('track')
-            if original_lang_node.attrib.get('kind') != 'asr' :
+            if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                 return {}
             original_lang = original_lang_node.attrib['lang_code']
@@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             url_map[itag] = format_url
         return url_map
 
+    def _extract_annotations(self, video_id):
+        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
+        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
+
     def _real_extract(self, url):
         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
         mobj = re.search(self._NEXT_URL_RE, url)
@@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         else:
             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 
+        # annotations
+        video_annotations = None
+        if self._downloader.params.get('writeannotations', False):
+                video_annotations = self._extract_annotations(video_id)
+
         # Decide which formats to download
 
         try:
@@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'subtitles':    video_subtitles,
                 'duration':     video_duration,
                 'age_limit':    18 if age_gate else 0,
+                'annotations':  video_annotations
             })
         return results
 
index 82a1daeb9075a056aa908e0a91bbd83b8897673a..833f981f24ca7b9dc2b2fd6cd79a40fe83afd019 100644 (file)
@@ -9,6 +9,7 @@ import io
 import json
 import locale
 import os
+import pipes
 import platform
 import re
 import socket
@@ -229,6 +230,19 @@ else:
                 return f
         return None
 
+# On python2.6 the xml.etree.ElementTree.Element methods don't support
+# the namespace parameter
+def xpath_with_ns(path, ns_map):
+    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.
+
+    Each '/'-separated component of *path* that carries a namespace prefix
+    is rewritten using the prefix->URI mapping in *ns_map*; components
+    without a prefix are kept unchanged.
+    """
+    components = [c.split(':') for c in path.split('/')]
+    replaced = []
+    for c in components:
+        if len(c) == 1:
+            # No namespace prefix on this step.
+            replaced.append(c[0])
+        else:
+            ns, tag = c
+            replaced.append('{%s}%s' % (ns_map[ns], tag))
+    return '/'.join(replaced)
+
 def htmlentity_transform(matchobj):
     """Transforms an HTML entity to a character.
 
@@ -927,3 +941,24 @@ class locked_file(object):
 
     def read(self, *args):
         return self.f.read(*args)
+
+
+def shell_quote(args):
+    """Return a single shell-escaped command line built from the argument list."""
+    return ' '.join(map(pipes.quote, args))
+
+
+def smuggle_url(url, data):
+    """ Pass additional data in a URL for internal use. """
+
+    # The data is JSON-encoded and appended as a URL-encoded fragment so it
+    # travels inside an ordinary URL string; unsmuggle_url() reverses this.
+    sdata = compat_urllib_parse.urlencode(
+        {u'__youtubedl_smuggle': json.dumps(data)})
+    return url + u'#' + sdata
+
+
+def unsmuggle_url(smug_url):
+    """Split a URL produced by smuggle_url() into (url, data).
+
+    Returns (smug_url, None) when the URL carries no smuggled data.
+    """
+    if not '#__youtubedl_smuggle' in smug_url:
+        return smug_url, None
+    # Everything after the last '#' is the urlencoded JSON payload.
+    url, _, sdata = smug_url.rpartition(u'#')
+    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
+    data = json.loads(jsond)
+    return url, data
index 1004af116bc88dba99ff62273d63cc02d6154ea4..22a51ffe65aa70abbb5fd8f40946639e8dfa8fb6 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2013.10.09'
+__version__ = '2013.10.17'