Merge branch 'the-daily-show-podcast' of https://github.com/fstirlitz/youtube-dl...
author: Sergey M․ <dstftw@gmail.com>
Sun, 22 Mar 2015 01:44:28 +0000 (07:44 +0600)
committer: Sergey M․ <dstftw@gmail.com>
Sun, 22 Mar 2015 01:44:28 +0000 (07:44 +0600)
28 files changed:
AUTHORS
devscripts/generate_aes_testdata.py [new file with mode: 0644]
docs/supportedsites.md
test/test_YoutubeDL.py
test/test_aes.py [new file with mode: 0644]
test/test_http.py
test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/douyutv.py [new file with mode: 0644]
youtube_dl/extractor/generic.py
youtube_dl/extractor/grooveshark.py
youtube_dl/extractor/krasview.py
youtube_dl/extractor/letv.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/nytimes.py
youtube_dl/extractor/primesharetv.py [new file with mode: 0644]
youtube_dl/extractor/sohu.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/ultimedia.py [new file with mode: 0644]
youtube_dl/extractor/videomega.py
youtube_dl/extractor/vine.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 872da60711fcda8aa55cb4c8f9f1a0d5cf7e3884..512469f4c9c9e50ce945372ca1dd5e704efc177b 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -116,3 +116,4 @@ Duncan Keall
 Alexander Mamay
 Devin J. Pohly
 Eduardo Ferro Aldama
+Jeff Buchbinder
diff --git a/devscripts/generate_aes_testdata.py b/devscripts/generate_aes_testdata.py
new file mode 100644 (file)
index 0000000..ff66449
--- /dev/null
@@ -0,0 +1,36 @@
+from __future__ import unicode_literals
+
+import codecs
+import subprocess
+
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.utils import intlist_to_bytes
+from youtube_dl.aes import aes_encrypt, key_expansion
+
+secret_msg = b'Secret message goes here'
+
+
+def hex_str(int_list):
+    return codecs.encode(intlist_to_bytes(int_list), 'hex')
+
+
+def openssl_encode(algo, key, iv):
+    cmd = ['openssl', 'enc', '-e', '-' + algo, '-K', hex_str(key), '-iv', hex_str(iv)]
+    prog = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+    out, _ = prog.communicate(secret_msg)
+    return out
+
+iv = key = [0x20, 0x15] + 14 * [0]
+
+r = openssl_encode('aes-128-cbc', key, iv)
+print('aes_cbc_decrypt')
+print(repr(r))
+
+password = key
+new_key = aes_encrypt(password, key_expansion(password))
+r = openssl_encode('aes-128-ctr', new_key, iv)
+print('aes_decrypt_text')
+print(repr(r))
index d6a1e67c625379c966e98a87f2f0989ec720afdf..72b365305a1b4ec40eb7d0043750d39a16ad66d7 100644 (file)
  - **Discovery**
  - **divxstage**: DivxStage
  - **Dotsub**
+ - **DouyuTV**
  - **DRBonanza**
  - **Dropbox**
  - **DrTuber**
  - **PornHubPlaylist**
  - **Pornotube**
  - **PornoXO**
+ - **PrimeShareTV**
  - **PromptFile**
  - **prosiebensat1**: ProSiebenSat.1 Digital
  - **Puls4**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
+ - **rtve.es:infantil**: RTVE infantil
  - **rtve.es:live**: RTVE.es live streams
  - **RUHD**
  - **rutube**: Rutube videos
  - **Ubu**
  - **udemy**
  - **udemy:course**
+ - **Ultimedia**
  - **Unistra**
  - **Urort**: NRK P3 Urørt
  - **ustream**
index db8a47d2d0d9e18f46e78b43068196efadc0f943..652519831cec63a518ceb87d409603f13d9383a8 100644 (file)
@@ -14,6 +14,7 @@ from test.helper import FakeYDL, assertRegexpMatches
 from youtube_dl import YoutubeDL
 from youtube_dl.extractor import YoutubeIE
 from youtube_dl.postprocessor.common import PostProcessor
+from youtube_dl.utils import match_filter_func
 
 TEST_URL = 'http://localhost/sample.mp4'
 
@@ -339,6 +340,8 @@ class TestFormatSelection(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['format_id'], 'G')
 
+
+class TestYoutubeDL(unittest.TestCase):
     def test_subtitles(self):
         def s_formats(lang, autocaption=False):
             return [{
@@ -461,6 +464,73 @@ class TestFormatSelection(unittest.TestCase):
         self.assertTrue(os.path.exists(audiofile), '%s doesn\'t exist' % audiofile)
         os.unlink(audiofile)
 
+    def test_match_filter(self):
+        class FilterYDL(YDL):
+            def __init__(self, *args, **kwargs):
+                super(FilterYDL, self).__init__(*args, **kwargs)
+                self.params['simulate'] = True
+
+            def process_info(self, info_dict):
+                super(YDL, self).process_info(info_dict)
+
+            def _match_entry(self, info_dict, incomplete):
+                res = super(FilterYDL, self)._match_entry(info_dict, incomplete)
+                if res is None:
+                    self.downloaded_info_dicts.append(info_dict)
+                return res
+
+        first = {
+            'id': '1',
+            'url': TEST_URL,
+            'title': 'one',
+            'extractor': 'TEST',
+            'duration': 30,
+            'filesize': 10 * 1024,
+        }
+        second = {
+            'id': '2',
+            'url': TEST_URL,
+            'title': 'two',
+            'extractor': 'TEST',
+            'duration': 10,
+            'description': 'foo',
+            'filesize': 5 * 1024,
+        }
+        videos = [first, second]
+
+        def get_videos(filter_=None):
+            ydl = FilterYDL({'match_filter': filter_})
+            for v in videos:
+                ydl.process_ie_result(v, download=True)
+            return [v['id'] for v in ydl.downloaded_info_dicts]
+
+        res = get_videos()
+        self.assertEqual(res, ['1', '2'])
+
+        def f(v):
+            if v['id'] == '1':
+                return None
+            else:
+                return 'Video id is not 1'
+        res = get_videos(f)
+        self.assertEqual(res, ['1'])
+
+        f = match_filter_func('duration < 30')
+        res = get_videos(f)
+        self.assertEqual(res, ['2'])
+
+        f = match_filter_func('description = foo')
+        res = get_videos(f)
+        self.assertEqual(res, ['2'])
+
+        f = match_filter_func('description =? foo')
+        res = get_videos(f)
+        self.assertEqual(res, ['1', '2'])
+
+        f = match_filter_func('filesize > 5KiB')
+        res = get_videos(f)
+        self.assertEqual(res, ['1'])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_aes.py b/test/test_aes.py
new file mode 100644 (file)
index 0000000..111b902
--- /dev/null
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text
+from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes
+import base64
+
+# the encrypted data can be generated with 'devscripts/generate_aes_testdata.py'
+
+
+class TestAES(unittest.TestCase):
+    def setUp(self):
+        self.key = self.iv = [0x20, 0x15] + 14 * [0]
+        self.secret_msg = b'Secret message goes here'
+
+    def test_encrypt(self):
+        msg = b'message'
+        key = list(range(16))
+        encrypted = aes_encrypt(bytes_to_intlist(msg), key)
+        decrypted = intlist_to_bytes(aes_decrypt(encrypted, key))
+        self.assertEqual(decrypted, msg)
+
+    def test_cbc_decrypt(self):
+        data = bytes_to_intlist(
+            b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd"
+        )
+        decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv))
+        self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
+
+    def test_decrypt_text(self):
+        password = intlist_to_bytes(self.key).decode('utf-8')
+        encrypted = base64.b64encode(
+            intlist_to_bytes(self.iv[:8]) +
+            b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae'
+        )
+        decrypted = (aes_decrypt_text(encrypted, password, 16))
+        self.assertEqual(decrypted, self.secret_msg)
+
+if __name__ == '__main__':
+    unittest.main()
index bd4d46fef95e60eb7d91752b4c8ba0462b4c469e..f2e305b6fed3ce2f0574a7c20e89ffb977934f28 100644 (file)
@@ -8,7 +8,7 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from youtube_dl import YoutubeDL
-from youtube_dl.compat import compat_http_server
+from youtube_dl.compat import compat_http_server, compat_urllib_request
 import ssl
 import threading
 
@@ -68,5 +68,52 @@ class TestHTTP(unittest.TestCase):
         r = ydl.extract_info('https://localhost:%d/video.html' % self.port)
         self.assertEqual(r['url'], 'https://localhost:%d/vid.mp4' % self.port)
 
+
+def _build_proxy_handler(name):
+    class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+        proxy_name = name
+
+        def log_message(self, format, *args):
+            pass
+
+        def do_GET(self):
+            self.send_response(200)
+            self.send_header('Content-Type', 'text/plain; charset=utf-8')
+            self.end_headers()
+            self.wfile.write('{self.proxy_name}: {self.path}'.format(self=self).encode('utf-8'))
+    return HTTPTestRequestHandler
+
+
+class TestProxy(unittest.TestCase):
+    def setUp(self):
+        self.proxy = compat_http_server.HTTPServer(
+            ('localhost', 0), _build_proxy_handler('normal'))
+        self.port = self.proxy.socket.getsockname()[1]
+        self.proxy_thread = threading.Thread(target=self.proxy.serve_forever)
+        self.proxy_thread.daemon = True
+        self.proxy_thread.start()
+
+        self.cn_proxy = compat_http_server.HTTPServer(
+            ('localhost', 0), _build_proxy_handler('cn'))
+        self.cn_port = self.cn_proxy.socket.getsockname()[1]
+        self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever)
+        self.cn_proxy_thread.daemon = True
+        self.cn_proxy_thread.start()
+
+    def test_proxy(self):
+        cn_proxy = 'localhost:{0}'.format(self.cn_port)
+        ydl = YoutubeDL({
+            'proxy': 'localhost:{0}'.format(self.port),
+            'cn_verification_proxy': cn_proxy,
+        })
+        url = 'http://foo.com/bar'
+        response = ydl.urlopen(url).read().decode('utf-8')
+        self.assertEqual(response, 'normal: {0}'.format(url))
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Ytdl-request-proxy', cn_proxy)
+        response = ydl.urlopen(req).read().decode('utf-8')
+        self.assertEqual(response, 'cn: {0}'.format(url))
+
 if __name__ == '__main__':
     unittest.main()
index 8f790bf0a7853e4fd3ed391a5169e50da2600a5b..a8ab876850d3645eb615d552162128befdac316d 100644 (file)
@@ -24,6 +24,7 @@ from youtube_dl.utils import (
     encodeFilename,
     escape_rfc3986,
     escape_url,
+    ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
     InAdvancePagedList,
@@ -39,6 +40,7 @@ from youtube_dl.utils import (
     read_batch_urls,
     sanitize_filename,
     sanitize_path,
+    sanitize_url_path_consecutive_slashes,
     shell_quote,
     smuggle_url,
     str_to_int,
@@ -53,6 +55,7 @@ from youtube_dl.utils import (
     urlencode_postdata,
     version_tuple,
     xpath_with_ns,
+    xpath_text,
     render_table,
     match_str,
 )
@@ -168,6 +171,26 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(sanitize_path('./abc'), 'abc')
         self.assertEqual(sanitize_path('./../abc'), '..\\abc')
 
+    def test_sanitize_url_path_consecutive_slashes(self):
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname//'),
+            'http://hostname/')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'),
+            'http://hostname/foo/bar/filename.html')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/'),
+            'http://hostname/')
+        self.assertEqual(
+            sanitize_url_path_consecutive_slashes('http://hostname/abc//'),
+            'http://hostname/abc/')
+
     def test_ordered_set(self):
         self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
         self.assertEqual(orderedSet([]), [])
@@ -229,6 +252,17 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(find('media:song/media:author').text, 'The Author')
         self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3')
 
+    def test_xpath_text(self):
+        testxml = '''<root>
+            <div>
+                <p>Foo</p>
+            </div>
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+        self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
+        self.assertTrue(xpath_text(doc, 'div/bar') is None)
+        self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True)
+
     def test_smuggle_url(self):
         data = {"ö": "ö", "abc": [3]}
         url = 'https://foo.bar/baz?x=y#a'
index 5a83bc95662b92e786ed1088155321645eadea98..b5ef5e009f972cb9308bf5c09c561ce393c05a2a 100755 (executable)
@@ -328,9 +328,6 @@ class YoutubeDL(object):
                 'Parameter outtmpl is bytes, but should be a unicode string. '
                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
 
-        if '%(stitle)s' in self.params.get('outtmpl', ''):
-            self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
-
         self._setup_opener()
 
         if auto_init:
@@ -1218,9 +1215,6 @@ class YoutubeDL(object):
         if len(info_dict['title']) > 200:
             info_dict['title'] = info_dict['title'][:197] + '...'
 
-        # Keep for backwards compatibility
-        info_dict['stitle'] = info_dict['title']
-
         if 'format' not in info_dict:
             info_dict['format'] = info_dict['ext']
 
index e94779d40455f9069f679afb6e7dd7753147afa2..a20492fc372e73577536c93e04986152d5e2d0d1 100644 (file)
@@ -107,6 +107,7 @@ from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .dfb import DFBIE
 from .dotsub import DotsubIE
+from .douyutv import DouyuTVIE
 from .dreisat import DreiSatIE
 from .drbonanza import DRBonanzaIE
 from .drtuber import DrTuberIE
@@ -346,6 +347,7 @@ from .npo import (
 )
 from .nrk import (
     NRKIE,
+    NRKPlaylistIE,
     NRKTVIE,
 )
 from .ntvde import NTVDeIE
@@ -381,6 +383,7 @@ from .pornhub import (
 )
 from .pornotube import PornotubeIE
 from .pornoxo import PornoXOIE
+from .primesharetv import PrimeShareTVIE
 from .promptfile import PromptFileIE
 from .prosiebensat1 import ProSiebenSat1IE
 from .puls4 import Puls4IE
@@ -537,6 +540,7 @@ from .udemy import (
     UdemyIE,
     UdemyCourseIE
 )
+from .ultimedia import UltimediaIE
 from .unistra import UnistraIE
 from .urort import UrortIE
 from .ustream import UstreamIE, UstreamChannelIE
index 929dd3cc5550beb1b2da8874763084b5146d2f33..8273bd6c9ae3cdff82052c8f63efc68be97561b3 100644 (file)
@@ -146,6 +146,7 @@ class ArteTVPlus7IE(InfoExtractor):
 
             formats.append(format)
 
+        self._check_formats(formats, video_id)
         self._sort_formats(formats)
 
         info_dict['formats'] = formats
index e64b88fbc4f796accae6816c5fcec2f2a1aa3ecd..6ded723c96ddad70ebf95b9fb6b73c811bcc6746 100644 (file)
@@ -23,7 +23,6 @@ from ..utils import (
 )
 from ..aes import (
     aes_cbc_decrypt,
-    inc,
 )
 
 
@@ -102,13 +101,6 @@ class CrunchyrollIE(InfoExtractor):
 
         key = obfuscate_key(id)
 
-        class Counter:
-            __value = iv
-
-            def next_value(self):
-                temp = self.__value
-                self.__value = inc(self.__value)
-                return temp
         decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
         return zlib.decompress(decrypted_data)
 
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
new file mode 100644 (file)
index 0000000..d7956e6
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class DouyuTVIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
+    _TEST = {
+        'url': 'http://www.douyutv.com/iseven',
+        'info_dict': {
+            'id': 'iseven',
+            'ext': 'flv',
+            'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:9e525642c25a0a24302869937cf69d17',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': '7师傅',
+            'uploader_id': '431925',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_json(
+            'http://www.douyutv.com/api/client/room/%s' % video_id, video_id)
+
+        data = config['data']
+
+        error_code = config.get('error', 0)
+        show_status = data.get('show_status')
+        if error_code != 0:
+            raise ExtractorError(
+                'Server reported error %i' % error_code, expected=True)
+
+        # 1 = live, 2 = offline
+        if show_status == '2':
+            raise ExtractorError(
+                'Live stream is offline', expected=True)
+
+        base_url = data['rtmp_url']
+        live_path = data['rtmp_live']
+
+        title = self._live_title(data['room_name'])
+        description = data.get('show_details')
+        thumbnail = data.get('room_src')
+
+        uploader = data.get('nickname')
+        uploader_id = data.get('owner_uid')
+
+        multi_formats = data.get('rtmp_multi_bitrate')
+        if not isinstance(multi_formats, dict):
+            multi_formats = {}
+        multi_formats['live'] = live_path
+
+        formats = [{
+            'url': '%s/%s' % (base_url, format_path),
+            'format_id': format_id,
+            'preference': 1 if format_id == 'live' else 0,
+        } for format_id, format_path in multi_formats.items()]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+            'is_live': True,
+        }
index 4e6927b08b032718e09fdbe0b35595456d587156..8716e4503ad4ae61a57745ec1eafe96fe5e25ddc 100644 (file)
@@ -1006,6 +1006,13 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'))
 
+        # Look for NYTimes player
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for Ooyala videos
         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
@@ -1268,10 +1275,16 @@ class GenericIE(InfoExtractor):
             # HTML5 video
             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
         if not found:
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
             found = re.search(
                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
-                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
                 webpage)
+            if not found:
+                # Look also in Refresh HTTP header
+                refresh_header = head_response.headers.get('Refresh')
+                if refresh_header:
+                    found = re.search(REDIRECT_REGEX, refresh_header)
             if found:
                 new_url = found.group(1)
                 self.report_following_redirect(new_url)
index 848d17beb4d3559cb818f14795655b12912e0f11..36ad4915c5e7ea47feec8d32317ec15845a5b7d0 100644 (file)
@@ -140,9 +140,9 @@ class GroovesharkIE(InfoExtractor):
 
         if webpage is not None:
             o = GroovesharkHtmlParser.extract_object_tags(webpage)
-            return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'])
+            return webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed']
 
-        return (webpage, None)
+        return webpage, None
 
     def _real_initialize(self):
         self.ts = int(time.time() * 1000)  # timestamp in millis
@@ -154,7 +154,7 @@ class GroovesharkIE(InfoExtractor):
         swf_referer = None
         if self.do_playerpage_request:
             (_, player_objs) = self._get_playerpage(url)
-            if player_objs is not None:
+            if player_objs:
                 swf_referer = self._build_swf_referer(url, player_objs[0])
                 self.to_screen('SWF Referer: %s' % swf_referer)
 
index e46954b47449b11be795c17478e168a6a57af0fd..96f95979a22429d2a19af3575ad1ca25c463b13e 100644 (file)
@@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor):
         description = self._og_search_description(webpage, default=None)
         thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
         duration = int_or_none(flashvars.get('duration'))
-        width = int_or_none(self._og_search_property('video:width', webpage, 'video width'))
-        height = int_or_none(self._og_search_property('video:height', webpage, 'video height'))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, 'video width', default=None))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, 'video height', default=None))
 
         return {
             'id': video_id,
index 85eee141b119519e9a6aac9a2fd8bfb1e05419a0..1484ac0d267697dceb34c9e406e3a26b26a37f54 100644 (file)
@@ -88,12 +88,13 @@ class LetvIE(InfoExtractor):
         play_json_req = compat_urllib_request.Request(
             'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
         )
-        play_json_req.add_header(
-            'Ytdl-request-proxy',
-            self._downloader.params.get('cn_verification_proxy'))
+        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+        if cn_verification_proxy:
+            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
+
         play_json = self._download_json(
             play_json_req,
-            media_id, 'playJson data')
+            media_id, 'Downloading playJson data')
 
         # Check for errors
         playstatus = play_json['playstatus']
index 1831c6749401405c5a39ca60f4347df03bfe3631..21aea0c5513ab30cbe27384351229c42e864fab3 100644 (file)
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 import re
+import itertools
 
 from .common import InfoExtractor
 from ..compat import (
@@ -10,7 +11,6 @@ from ..utils import (
     ExtractorError,
     HEADRequest,
     str_to_int,
-    parse_iso8601,
 )
 
 
@@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor):
             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
             'uploader': 'Daniel Holbach',
             'uploader_id': 'dholbach',
-            'upload_date': '20111115',
-            'timestamp': 1321359578,
             'thumbnail': 're:https?://.*\.jpg',
             'view_count': int,
             'like_count': int,
@@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor):
         'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
         'info_dict': {
             'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
-            'ext': 'm4a',
-            'title': 'Electric Relaxation vol. 3',
+            'ext': 'mp3',
+            'title': 'Caribou 7 inch Vinyl Mix & Chat',
             'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
-            'uploader': 'Daniel Drumz',
+            'uploader': 'Gilles Peterson Worldwide',
             'uploader_id': 'gillespeterson',
-            'thumbnail': 're:https?://.*\.jpg',
+            'thumbnail': 're:https?://.*/images/',
             'view_count': int,
             'like_count': int,
         },
     }]
 
-    def _get_url(self, track_id, template_url):
-        server_count = 30
-        for i in range(server_count):
-            url = template_url % i
+    def _get_url(self, track_id, template_url, server_number):
+        boundaries = (1, 30)
+        for nr in server_numbers(server_number, boundaries):
+            url = template_url % nr
             try:
                 # We only want to know if the request succeed
                 # don't download the whole file
                 self._request_webpage(
                     HEADRequest(url), track_id,
-                    'Checking URL %d/%d ...' % (i + 1, server_count + 1))
+                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))
                 return url
             except ExtractorError:
                 pass
-
         return None
 
     def _real_extract(self, url):
@@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor):
         preview_url = self._search_regex(
             r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
         song_url = preview_url.replace('/previews/', '/c/originals/')
+        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
-        final_song_url = self._get_url(track_id, template_url)
+        final_song_url = self._get_url(track_id, template_url, server_number)
         if final_song_url is None:
             self.to_screen('Trying with m4a extension')
             template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
-            final_song_url = self._get_url(track_id, template_url)
+            final_song_url = self._get_url(track_id, template_url, server_number)
         if final_song_url is None:
             raise ExtractorError('Unable to extract track url')
 
         PREFIX = (
-            r'<span class="play-button[^"]*?"'
+            r'm-play-on-spacebar[^>]+'
             r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
         title = self._html_search_regex(
             PREFIX + r'm-title="([^"]+)"', webpage, 'title')
@@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor):
             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
         description = self._og_search_description(webpage)
         like_count = str_to_int(self._search_regex(
-            [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
-             r'/favorites/?">([0-9]+)<'],
+            r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',
             webpage, 'like count', fatal=False))
         view_count = str_to_int(self._search_regex(
             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
              r'/listeners/?">([0-9,.]+)</a>'],
             webpage, 'play count', fatal=False))
-        timestamp = parse_iso8601(self._search_regex(
-            r'<time itemprop="dateCreated" datetime="([^"]+)">',
-            webpage, 'upload date', default=None))
 
         return {
             'id': track_id,
@@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor):
             'thumbnail': thumbnail,
             'uploader': uploader,
             'uploader_id': uploader_id,
-            'timestamp': timestamp,
             'view_count': view_count,
             'like_count': like_count,
         }
+
+
+def server_numbers(first, boundaries):
+    """ Server numbers to try in descending order of probable availability.
+    Starting from first (i.e. the number of the server hosting the preview file)
+    and going further and further up to the higher boundary and down to the
+    lower one in an alternating fashion. Namely:
+
+        server_numbers(2, (1, 5))
+
+        # Where the preview server is 2, min number is 1 and max is 5.
+        # Yields: 2, 3, 1, 4, 5
+
+    Why not random numbers or increasing sequences? Since from what I've seen,
+    full length files seem to be hosted on servers whose number is closer to
+    that of the preview; to be confirmed.
+    """
+    zip_longest = getattr(itertools, 'zip_longest', None)
+    if zip_longest is None:
+        # python 2.x
+        zip_longest = itertools.izip_longest
+
+    if len(boundaries) != 2:
+        raise ValueError("boundaries should be a two-element tuple")
+    min, max = boundaries
+    highs = range(first + 1, max + 1)
+    lows = range(first - 1, min - 1, -1)
+    rest = filter(
+        None, itertools.chain.from_iterable(zip_longest(highs, lows)))
+    yield first
+    for n in rest:
+        yield n
index 7fb4e57dfb81029c0fc46dbab83ced5b2fbc766b..ddec7b3387f98e8bc86ab44feefd175676d40489 100644 (file)
@@ -22,7 +22,7 @@ class NiconicoIE(InfoExtractor):
     IE_NAME = 'niconico'
     IE_DESC = 'ニコニコ動画'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.nicovideo.jp/watch/sm22312215',
         'md5': 'd1a75c0823e2f629128c43e1212760f9',
         'info_dict': {
@@ -39,7 +39,24 @@ class NiconicoIE(InfoExtractor):
             'username': 'ydl.niconico@gmail.com',
             'password': 'youtube-dl',
         },
-    }
+    }, {
+        'url': 'http://www.nicovideo.jp/watch/nm14296458',
+        'md5': '8db08e0158457cf852a31519fceea5bc',
+        'info_dict': {
+            'id': 'nm14296458',
+            'ext': 'swf',
+            'title': '【鏡音リン】Dance on media【オリジナル】take2!',
+            'description': 'md5:',
+            'uploader': 'りょうた',
+            'uploader_id': '18822557',
+            'upload_date': '20110429',
+            'duration': 209,
+        },
+        'params': {
+            'username': 'ydl.niconico@gmail.com',
+            'password': 'youtube-dl',
+        },
+    }]
 
     _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
     _NETRC_MACHINE = 'niconico'
@@ -89,7 +106,7 @@ class NiconicoIE(InfoExtractor):
         if self._AUTHENTICATED:
             # Get flv info
             flv_info_webpage = self._download_webpage(
-                'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                 video_id, 'Downloading flv info')
         else:
             # Get external player info
index bff36f9d3f24cad293144d8e216faf5eeefed92c..e91d3a248ec3367af8b7e9a8f60be0d503877481 100644 (file)
@@ -14,46 +14,48 @@ from ..utils import (
 
 
 class NRKIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})'
+    _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
 
     _TESTS = [
         {
-            'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/',
-            'md5': 'a6eac35052f3b242bb6bb7f43aed5886',
+            'url': 'http://www.nrk.no/video/PS*150533',
+            'md5': 'bccd850baebefe23b56d708a113229c2',
             'info_dict': {
                 'id': '150533',
                 'ext': 'flv',
                 'title': 'Dompap og andre fugler i Piip-Show',
-                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f'
+                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+                'duration': 263,
             }
         },
         {
-            'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/',
-            'md5': '3471f2a51718195164e88f46bf427668',
+            'url': 'http://www.nrk.no/video/PS*154915',
+            'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
             'info_dict': {
                 'id': '154915',
                 'ext': 'flv',
                 'title': 'Slik høres internett ut når du er blind',
                 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+                'duration': 20,
             }
         },
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        page = self._download_webpage(url, video_id)
-
-        video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id')
+        video_id = self._match_id(url)
 
         data = self._download_json(
-            'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON')
+            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
+            video_id, 'Downloading media JSON')
 
         if data['usageRights']['isGeoBlocked']:
-            raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True)
+            raise ExtractorError(
+                'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge',
+                expected=True)
+
+        video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
 
-        video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124'
+        duration = parse_duration(data.get('duration'))
 
         images = data.get('images')
         if images:
@@ -69,10 +71,51 @@ class NRKIE(InfoExtractor):
             'ext': 'flv',
             'title': data['title'],
             'description': data['description'],
+            'duration': duration,
             'thumbnail': thumbnail,
         }
 
 
+class NRKPlaylistIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
+        'info_dict': {
+            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
+            'title': 'Gjenopplev den historiske solformørkelsen',
+            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
+        'info_dict': {
+            'id': 'rivertonprisen-til-karin-fossum-1.12266449',
+            'title': 'Rivertonprisen til Karin Fossum',
+            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
+        },
+        'playlist_count': 5,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('nrk:%s' % video_id, 'NRK')
+            for video_id in re.findall(
+                r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
+                webpage)
+        ]
+
+        playlist_title = self._og_search_title(webpage)
+        playlist_description = self._og_search_description(webpage)
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)
+
+
 class NRKTVIE(InfoExtractor):
     _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
 
index 56e1cad3b0021431721b59df2162feaf7e0c357b..03f0a4de6dccd8485091b802422bbfaf5f7402df 100644 (file)
@@ -1,15 +1,17 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+)
 
 
 class NYTimesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
         'md5': '18a525a510f942ada2720db5f31644c0',
         'info_dict': {
@@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor):
             'uploader': 'Brett Weiner',
             'duration': 419,
         }
-    }
+    }, {
+        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         video_data = self._download_json(
-            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
+            video_id, 'Downloading video JSON')
 
         title = video_data['headline']
-        description = video_data['summary']
-        duration = video_data['duration'] / 1000.0
+        description = video_data.get('summary')
+        duration = float_or_none(video_data.get('duration'), 1000)
 
         uploader = video_data['byline']
         timestamp = parse_iso8601(video_data['publication_date'][:-8])
@@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor):
         formats = [
             {
                 'url': video['url'],
-                'format_id': video['type'],
-                'vcodec': video['video_codec'],
-                'width': video['width'],
-                'height': video['height'],
-                'filesize': get_file_size(video['fileSize']),
+                'format_id': video.get('type'),
+                'vcodec': video.get('video_codec'),
+                'width': int_or_none(video.get('width')),
+                'height': int_or_none(video.get('height')),
+                'filesize': get_file_size(video.get('fileSize')),
             } for video in video_data['renditions']
         ]
         self._sort_formats(formats)
@@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor):
         thumbnails = [
             {
                 'url': 'http://www.nytimes.com/%s' % image['url'],
-                'resolution': '%dx%d' % (image['width'], image['height']),
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
             } for image in video_data['images']
         ]
 
diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py
new file mode 100644 (file)
index 0000000..01cc3d9
--- /dev/null
@@ -0,0 +1,69 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import ExtractorError
+
+
+class PrimeShareTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)'
+
+    _TEST = {
+        'url': 'http://primeshare.tv/download/238790B611',
+        'md5': 'b92d9bf5461137c36228009f31533fbc',
+        'info_dict': {
+            'id': '238790B611',
+            'ext': 'mp4',
+            'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>File not exist<' in webpage:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        fields = dict(re.findall(r'''(?x)<input\s+
+            type="hidden"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', webpage))
+
+        headers = {
+            'Referer': url,
+            'Content-Type': 'application/x-www-form-urlencoded',
+        }
+
+        wait_time = int(self._search_regex(
+            r'var\s+cWaitTime\s*=\s*(\d+)',
+            webpage, 'wait time', default=7)) + 1
+        self._sleep(wait_time, video_id)
+
+        req = compat_urllib_request.Request(
+            url, compat_urllib_parse.urlencode(fields), headers)
+        video_page = self._download_webpage(
+            req, video_id, 'Downloading video page')
+
+        video_url = self._search_regex(
+            r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'",
+            video_page, 'video url')
+
+        title = self._html_search_regex(
+            r'<h1>Watch\s*(?:&nbsp;)?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?:&nbsp;)?\s*<strong>',
+            video_page, 'title')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': 'mp4',
+        }
index c04791997f3672cdb643870086c7ed7f52db54c1..11edf616ac8a781f0765cec94a7e49b2cb07c32c 100644 (file)
@@ -4,22 +4,87 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from .common import compat_str
+from ..compat import (
+    compat_str,
+    compat_urllib_request
+)
+from ..utils import sanitize_url_path_consecutive_slashes
 
 
 class SohuIE(InfoExtractor):
     _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
 
-    _TEST = {
+    _TESTS = [{
+        'note': 'This video is available only in Mainland China',
         'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
-        'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7',
+        'md5': '29175c8cadd8b5cc4055001e85d6b372',
         'info_dict': {
             'id': '382479172',
             'ext': 'mp4',
             'title': 'MV:Far East Movement《The Illest》',
         },
-        'skip': 'Only available from China',
-    }
+        'params': {
+            'cn_verification_proxy': 'proxy.uku.im:8888'
+        }
+    }, {
+        'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
+        'md5': '699060e75cf58858dd47fb9c03c42cfb',
+        'info_dict': {
+            'id': '409385080',
+            'ext': 'mp4',
+            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
+        }
+    }, {
+        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
+        'md5': '9bf34be48f2f4dadcb226c74127e203c',
+        'info_dict': {
+            'id': '78693464',
+            'ext': 'mp4',
+            'title': '【爱范品】第31期:MWC见不到的奇葩手机',
+        }
+    }, {
+        'note': 'Multipart video',
+        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
+        'info_dict': {
+            'id': '78910339',
+        },
+        'playlist': [{
+            'md5': 'bdbfb8f39924725e6589c146bc1883ad',
+            'info_dict': {
+                'id': '78910339_part1',
+                'ext': 'mp4',
+                'duration': 294,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }, {
+            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
+            'info_dict': {
+                'id': '78910339_part2',
+                'ext': 'mp4',
+                'duration': 300,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }, {
+            'md5': '8407e634175fdac706766481b9443450',
+            'info_dict': {
+                'id': '78910339_part3',
+                'ext': 'mp4',
+                'duration': 150,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }]
+    }, {
+        'note': 'Video with title containing dash',
+        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
+        'info_dict': {
+            'id': '78932792',
+            'ext': 'mp4',
+            'title': 'youtube-dl testing video',
+        },
+        'params': {
+            'skip_download': True
+        }
+    }]
 
     def _real_extract(self, url):
 
@@ -29,8 +94,14 @@ class SohuIE(InfoExtractor):
             else:
                 base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
 
+            req = compat_urllib_request.Request(base_data_url + vid_id)
+
+            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+            if cn_verification_proxy:
+                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
+
             return self._download_json(
-                base_data_url + vid_id, video_id,
+                req, video_id,
                 'Downloading JSON data for %s' % vid_id)
 
         mobj = re.match(self._VALID_URL, url)
@@ -38,10 +109,8 @@ class SohuIE(InfoExtractor):
         mytv = mobj.group('mytv') is not None
 
         webpage = self._download_webpage(url, video_id)
-        raw_title = self._html_search_regex(
-            r'(?s)<title>(.+?)</title>',
-            webpage, 'video title')
-        title = raw_title.partition('-')[0].strip()
+
+        title = self._og_search_title(webpage)
 
         vid = self._html_search_regex(
             r'var vid ?= ?["\'](\d+)["\']',
@@ -77,7 +146,9 @@ class SohuIE(InfoExtractor):
                     % (format_id, i + 1, part_count))
 
                 part_info = part_str.split('|')
-                video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+
+                video_url = sanitize_url_path_consecutive_slashes(
+                    '%s%s?key=%s' % (part_info[0], su[i], part_info[3]))
 
                 formats.append({
                     'url': video_url,
index cbdaf9c7ab0d8c878011b10f8c59d71c851007c5..aad2bf222269e5c3adf530b7332eed6eaabcee1a 100644 (file)
@@ -23,6 +23,7 @@ class TwitchBaseIE(InfoExtractor):
     _API_BASE = 'https://api.twitch.tv'
     _USHER_BASE = 'http://usher.twitch.tv'
     _LOGIN_URL = 'https://secure.twitch.tv/user/login'
+    _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login'
     _NETRC_MACHINE = 'twitch'
 
     def _handle_error(self, response):
@@ -67,14 +68,14 @@ class TwitchBaseIE(InfoExtractor):
             'authenticity_token': authenticity_token,
             'redirect_on_login': '',
             'embed_form': 'false',
-            'mp_source_action': '',
+            'mp_source_action': 'login-button',
             'follow': '',
-            'user[login]': username,
-            'user[password]': password,
+            'login': username,
+            'password': password,
         }
 
         request = compat_urllib_request.Request(
-            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+            self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
         request.add_header('Referer', self._LOGIN_URL)
         response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
new file mode 100644 (file)
index 0000000..06554a1
--- /dev/null
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    qualities,
+    unified_strdate,
+    clean_html,
+)
+
+
+class UltimediaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)'
+    _TESTS = [{
+        # news
+        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+        'md5': '276a0e49de58c7e85d32b057837952a2',
+        'info_dict': {
+            'id': 's8uk0r',
+            'ext': 'mp4',
+            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
+            'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'upload_date': '20150317',
+        },
+    }, {
+        # music
+        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
+        'md5': '2ea3513813cf230605c7e2ffe7eca61c',
+        'info_dict': {
+            'id': 'xvpfp8',
+            'ext': 'mp4',
+            'title': "Two - C'est la vie (Clip)",
+            'description': 'Two',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'upload_date': '20150224',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        deliver_url = self._search_regex(
+            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+            webpage, 'deliver URL')
+
+        deliver_page = self._download_webpage(
+            deliver_url, video_id, 'Downloading iframe page')
+
+        if '>This video is currently not available' in deliver_page:
+            raise ExtractorError(
+                'Video %s is currently not available' % video_id, expected=True)
+
+        player = self._parse_json(
+            self._search_regex(
+                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+            video_id)
+
+        quality = qualities(['flash', 'html5'])
+        formats = []
+        for mode in player['modes']:
+            video_url = mode.get('config', {}).get('file')
+            if not video_url:
+                continue
+            if re.match(r'https?://www\.youtube\.com/.+?', video_url):
+                return self.url_result(video_url, 'Youtube')
+            formats.append({
+                'url': video_url,
+                'format_id': mode.get('type'),
+                'quality': quality(mode.get('type')),
+            })
+        self._sort_formats(formats)
+
+        thumbnail = player.get('image')
+
+        title = clean_html((
+            self._html_search_regex(
+                r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
+                webpage, 'title', default=None)
+            or self._search_regex(
+                r"var\s+nameVideo\s*=\s*'([^']+)'",
+                deliver_page, 'title')))
+
+        description = clean_html(self._html_search_regex(
+            r'(?s)<span>Description</span>(.+?)</p>', webpage,
+            'description', fatal=False))
+
+        upload_date = unified_strdate(self._search_regex(
+            r'Ajouté le\s*<span>([^<]+)', webpage,
+            'upload date', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
index 27303031620a8c126797bcdd6207d2f2355c74be..eb309a7cdf99b3ebc4bde755fe09d47505516f28 100644 (file)
@@ -4,28 +4,21 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-    remove_start,
-)
+from ..compat import compat_urllib_request
 
 
 class VideoMegaIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?:www\.)?videomega\.tv/
-        (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
+        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
         '''
     _TEST = {
-        'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
+        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
         'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
         'info_dict': {
-            'id': 'QR0HCUHI1661IHUCH0RQ',
+            'id': '4GNA688SU99US886ANG4',
             'ext': 'mp4',
-            'title': 'Big Buck Bunny',
+            'title': 'BigBuckBunny_320x180',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
@@ -33,34 +26,24 @@ class VideoMegaIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
+        iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
         req = compat_urllib_request.Request(iframe_url)
         req.add_header('Referer', url)
         webpage = self._download_webpage(req, video_id)
 
-        try:
-            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
-        except IndexError:
-            raise ExtractorError('Unable to extract escaped data')
-
-        playlist = compat_urllib_parse.unquote(escaped_data)
-
+        title = self._html_search_regex(
+            r'<title>(.*?)</title>', webpage, 'title')
+        title = re.sub(
+            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
         thumbnail = self._search_regex(
-            r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
-        video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
-        title = remove_start(self._html_search_regex(
-            r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-        }]
-        self._sort_formats(formats)
+            r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+        video_url = self._search_regex(
+            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
 
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
+            'url': video_url,
             'thumbnail': thumbnail,
             'http_headers': {
                 'Referer': iframe_url,
index 0b58fe0fe0b5188e9c9865e56ce064e94dbc45e5..c3187cfeb7c7f04dd040ca4cb3e89f0653ccbd03 100644 (file)
@@ -33,14 +33,13 @@ class VineIE(InfoExtractor):
             r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
 
         formats = [{
-            'url': data['videoLowURL'],
-            'ext': 'mp4',
-            'format_id': 'low',
-        }, {
-            'url': data['videoUrl'],
-            'ext': 'mp4',
-            'format_id': 'standard',
-        }]
+            'format_id': '%(format)s-%(rate)s' % f,
+            'vcodec': f['format'],
+            'quality': f['rate'],
+            'url': f['videoUrl'],
+        } for f in data['videoUrls'] if f.get('rate')]
+
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
index c3135effc18d0df51982787933c06ceaa851e666..472d4df41fda2cb1ffd0392cc4da0f4bdcc2a48a 100644 (file)
@@ -326,6 +326,13 @@ def sanitize_path(s):
     return os.path.join(*sanitized_path)
 
 
+def sanitize_url_path_consecutive_slashes(url):
+    """Collapses consecutive slashes in URLs' path"""
+    parsed_url = list(compat_urlparse.urlparse(url))
+    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
+    return compat_urlparse.urlunparse(parsed_url)
+
+
 def orderedSet(iterable):
     """ Remove all duplicates from the input iterable """
     res = []
index 7ed07c375a9a898d45a145722a2ebd6ad93a9285..51b4260aad38f9dbae3b8d2e0d42602c540d3e4f 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.03.15'
+__version__ = '2015.03.18'