Merge pull request #3865 from diffycat/jpopsuki
authorSergey M. <dstftw@gmail.com>
Thu, 2 Oct 2014 12:38:29 +0000 (19:38 +0700)
committerSergey M. <dstftw@gmail.com>
Thu, 2 Oct 2014 12:38:29 +0000 (19:38 +0700)
[jpopsuki] Support category links

41 files changed:
README.md
test/helper.py
test/test_utils.py
test/test_youtube_signature.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/br.py
youtube_dl/extractor/cliphunter.py
youtube_dl/extractor/common.py
youtube_dl/extractor/dropbox.py
youtube_dl/extractor/eporner.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/funnyordie.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/godtube.py
youtube_dl/extractor/golem.py
youtube_dl/extractor/ign.py
youtube_dl/extractor/internetvideoarchive.py
youtube_dl/extractor/izlesene.py
youtube_dl/extractor/jukebox.py
youtube_dl/extractor/lrt.py [new file with mode: 0644]
youtube_dl/extractor/nfl.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/sportdeutschland.py
youtube_dl/extractor/sunporno.py
youtube_dl/extractor/tapely.py [new file with mode: 0644]
youtube_dl/extractor/ted.py
youtube_dl/extractor/thvideo.py
youtube_dl/extractor/tvigle.py
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vuclip.py
youtube_dl/extractor/worldstarhiphop.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/ynet.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py
youtube_dl/version.py

index 0f7442906f28ebf9901e6ebc594da64febef95c8..cabc5eb9adb998791256214ec6b1633ba9a075c6 100644 (file)
--- a/README.md
+++ b/README.md
@@ -348,21 +348,34 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231
 
 # FAQ
 
-### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
+### How do I update youtube-dl?
 
-YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
+If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
+
+If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update.
 
-If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version.
+If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distributions serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
 
-Alternatively, uninstall the youtube-dl package and follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html). In a pinch, this should do if you used `apt-get` before to install youtube-dl:
+As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like
+
+    sudo apt-get remove -y youtube-dl
+
+Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html):
 
 ```
-sudo apt-get remove -y youtube-dl
 sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl
 sudo chmod a+x /usr/local/bin/youtube-dl
 hash -r
 ```
 
+Again, from then on you'll be able to update with `sudo youtube-dl -U`.
+
+### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
+
+YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
+
+If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version. See above for a way to update.
+
 ### Do I always have to pass in `--max-quality FORMAT`, or `-citw`?
 
 By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, `--max-quality` *limits* the video quality (so if you want the best quality, do NOT pass it in), and the only option out of `-citw` that is regularly useful is `-i`.
index 7f3ab8438736485187f96464d6844080ac98c45f..62cb3ce0219ba46dadc1cc0c08891bf0941d2304 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import errno
 import io
 import hashlib
@@ -12,6 +14,7 @@ from youtube_dl import YoutubeDL
 from youtube_dl.utils import (
     compat_str,
     preferredencoding,
+    write_string,
 )
 
 
@@ -40,10 +43,10 @@ def report_warning(message):
     If stderr is a tty file the 'WARNING:' will be colored
     '''
     if sys.stderr.isatty() and os.name != 'nt':
-        _msg_header = u'\033[0;33mWARNING:\033[0m'
+        _msg_header = '\033[0;33mWARNING:\033[0m'
     else:
-        _msg_header = u'WARNING:'
-    output = u'%s %s\n' % (_msg_header, message)
+        _msg_header = 'WARNING:'
+    output = '%s %s\n' % (_msg_header, message)
     if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
         output = output.encode(preferredencoding())
     sys.stderr.write(output)
@@ -103,22 +106,22 @@ def expect_info_dict(self, expected_dict, got_dict):
 
             self.assertTrue(
                 isinstance(got, compat_str),
-                u'Expected a %s object, but got %s for field %s' % (
+                'Expected a %s object, but got %s for field %s' % (
                     compat_str.__name__, type(got).__name__, info_field))
             self.assertTrue(
                 match_rex.match(got),
-                u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+                'field %s (value: %r) should match %r' % (info_field, got, match_str))
         elif isinstance(expected, type):
             got = got_dict.get(info_field)
             self.assertTrue(isinstance(got, expected),
-                u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
+                'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
         else:
             if isinstance(expected, compat_str) and expected.startswith('md5:'):
                 got = 'md5:' + md5(got_dict.get(info_field))
             else:
                 got = got_dict.get(info_field)
             self.assertEqual(expected, got,
-                u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+                'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
 
     # Check for the presence of mandatory fields
     if got_dict.get('_type') != 'playlist':
@@ -126,7 +129,7 @@ def expect_info_dict(self, expected_dict, got_dict):
             self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
     # Check for mandatory fields that are automatically set by YoutubeDL
     for key in ['webpage_url', 'extractor', 'extractor_key']:
-        self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
+        self.assertTrue(got_dict.get(key), 'Missing field: %s' % key)
 
     # Are checkable fields missing from the test case definition?
     test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
@@ -134,7 +137,15 @@ def expect_info_dict(self, expected_dict, got_dict):
         if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
     missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
     if missing_keys:
-        sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
+        def _repr(v):
+            if isinstance(v, compat_str):
+                return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'")
+            else:
+                return repr(v)
+        info_dict_str = ''.join(
+            '    %s: %s,\n' % (_repr(k), _repr(v))
+            for k, v in test_info_dict.items())
+        write_string('\n"info_dict": {' + info_dict_str + '}\n', out=sys.stderr)
         self.assertFalse(
             missing_keys,
             'Missing keys in test definition: %s' % (
index 3efbed29dd34de570f2db4e6eb4954ec2f4b9c6e..bcca0efead42b85f39337a4c28f0d654447cd8e2 100644 (file)
@@ -22,7 +22,8 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     get_meta_content,
     orderedSet,
-    PagedList,
+    OnDemandPagedList,
+    InAdvancePagedList,
     parse_duration,
     read_batch_urls,
     sanitize_filename,
@@ -43,6 +44,7 @@ from youtube_dl.utils import (
     limit_length,
     escape_rfc3986,
     escape_url,
+    js_to_json,
 )
 
 
@@ -137,6 +139,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
         self.assertEqual(unified_strdate('1968-12-10'), '19681210')
+        self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
 
     def test_find_xpath_attr(self):
         testxml = '''<root>
@@ -246,10 +249,14 @@ class TestUtil(unittest.TestCase):
                 for i in range(firstid, upto):
                     yield i
 
-            pl = PagedList(get_page, pagesize)
+            pl = OnDemandPagedList(get_page, pagesize)
             got = pl.getslice(*sliceargs)
             self.assertEqual(got, expected)
 
+            iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
+            got = iapl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
         testPL(5, 2, (), [0, 1, 2, 3, 4])
         testPL(5, 2, (1,), [1, 2, 3, 4])
         testPL(5, 2, (2,), [2, 3, 4])
@@ -325,5 +332,28 @@ class TestUtil(unittest.TestCase):
         )
         self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')
 
+    def test_js_to_json_realworld(self):
+        inp = '''{
+            'clip':{'provider':'pseudo'}
+        }'''
+        self.assertEqual(js_to_json(inp), '''{
+            "clip":{"provider":"pseudo"}
+        }''')
+        json.loads(js_to_json(inp))
+
+        inp = '''{
+            'playlist':[{'controls':{'all':null}}]
+        }'''
+        self.assertEqual(js_to_json(inp), '''{
+            "playlist":[{"controls":{"all":null}}]
+        }''')
+
+    def test_js_to_json_edgecases(self):
+        on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
+        self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
+
+        on = js_to_json('{"abc": true}')
+        self.assertEqual(json.loads(on), {'abc': True})
+
 if __name__ == '__main__':
     unittest.main()
index 604e76ab60ba42081c3b4779e77e2963038f43e5..df2cb09f2a87dcacbb97de9193265f9bf1e852af 100644 (file)
@@ -47,18 +47,6 @@ _TESTS = [
         '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA',
         'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2',
     ),
-    (
-        'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf',
-        'swf',
-        86,
-        'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?'
-    ),
-    (
-        'http://s.ytimg.com/yts/swfbin/player-vflmDyk47/watch_as3.swf',
-        'swf',
-        'F375F75BF2AFDAAF2666E43868D46816F83F13E81C46.3725A8218E446A0DECD33F79DC282994D6AA92C92C9',
-        '9C29AA6D499282CD97F33DCED0A644E8128A5273.64C18E31F38361864D86834E6662FAADFA2FB57F'
-    ),
     (
         'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js',
         'js',
index 86bff185b40db1680593e0c6a96b0a7dcd5dd928..e51ea701f68d3b850edcc188f4221dceb139b6dc 100644 (file)
@@ -190,6 +190,7 @@ from .livestream import (
     LivestreamOriginalIE,
     LivestreamShortenerIE,
 )
+from .lrt import LRTIE
 from .lynda import (
     LyndaIE,
     LyndaCourseIE
@@ -354,6 +355,7 @@ from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
 from .tagesschau import TagesschauIE
+from .tapely import TapelyIE
 from .teachertube import (
     TeacherTubeIE,
     TeacherTubeUserIE,
@@ -371,7 +373,10 @@ from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .tlc import TlcIE, TlcDeIE
 from .tnaflix import TNAFlixIE
-from .thvideo import THVideoIE
+from .thvideo import (
+    THVideoIE,
+    THVideoPlaylistIE
+)
 from .toutv import TouTvIE
 from .toypics import ToypicsUserIE, ToypicsIE
 from .traileraddict import TrailerAddictIE
index 957d35979d34244f7c4cb4df649a349453bf9123..c3d02f85e8f023deac51287b72cd45623db72f07 100644 (file)
@@ -86,11 +86,15 @@ class ArteTVPlus7IE(InfoExtractor):
         info = self._download_json(json_url, video_id)
         player_info = info['videoJsonPlayer']
 
+        upload_date_str = player_info.get('shootingDate')
+        if not upload_date_str:
+            upload_date_str = player_info.get('VDA', '').split(' ')[0]
+
         info_dict = {
             'id': player_info['VID'],
             'title': player_info['VTI'],
             'description': player_info.get('VDE'),
-            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]),
+            'upload_date': unified_strdate(upload_date_str),
             'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
         }
 
index 4e2960c6260ebf6bf0d242a6b0bfc38baf40b25c..2e277c8c3c28af872750bc8108db56b1382fd992 100644 (file)
@@ -26,6 +26,8 @@ class BRIE(InfoExtractor):
                 'title': 'Wenn das Traditions-Theater wackelt',
                 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
                 'duration': 34,
+                'uploader': 'BR',
+                'upload_date': '20140802',
             }
         },
         {
@@ -66,8 +68,7 @@ class BRIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('id')
+        display_id = self._match_id(url)
         page = self._download_webpage(url, display_id)
         xml_url = self._search_regex(
             r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
index 65c12136a3a636763e5d41fb307beb798ee27b83..d4227e6ebb51244018d24da87927c54061058dc8 100644 (file)
@@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor):
             'title': 'Fun Jynx Maze solo',
             'thumbnail': 're:^https?://.*\.jpg$',
             'age_limit': 18,
-            'duration': 1317,
         }
     }
 
@@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor):
         thumbnail = self._search_regex(
             r"var\s+mov_thumb\s*=\s*'([^']+)';",
             webpage, 'thumbnail', fatal=False)
-        duration = int_or_none(self._search_regex(
-            r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False))
 
         return {
             'id': video_id,
             'title': video_title,
             'formats': formats,
-            'duration': duration,
             'age_limit': self._rta_search(webpage),
             'thumbnail': thumbnail,
         }
index f43a0a569a3d90d555a27cece8ac3e68951c5106..611cf95f1125ec9340a96554df9542d1fcbd62b4 100644 (file)
@@ -334,7 +334,11 @@ class InfoExtractor(object):
         try:
             return json.loads(json_string)
         except ValueError as ve:
-            raise ExtractorError('Failed to download JSON', cause=ve)
+            errmsg = '%s: Failed to parse JSON ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
 
     def report_warning(self, msg, video_id=None):
         idstr = '' if video_id is None else '%s: ' % video_id
index 817a9bd6143544560541dbdb1c2d1ea18b95177d..5f24ac7214a95b762d3805779d1c9517ca3d0000 100644 (file)
@@ -29,9 +29,8 @@ class DropboxIE(InfoExtractor):
         video_id = mobj.group('id')
         fn = compat_urllib_parse_unquote(url_basename(url))
         title = os.path.splitext(fn)[0]
-        video_url = (
-            re.sub(r'[?&]dl=0', '', url) +
-            ('?' if '?' in url else '&') + 'dl=1')
+        video_url = re.sub(r'[?&]dl=0', '', url)
+        video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
 
         return {
             'id': video_id,
index 522aa3d639d6a05f9aca15171b80fa97a0eb0715..bb231ecb1cd81577d41c9c16ec3e0156c4abcf3e 100644 (file)
@@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
     _TEST = {
         'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
-        'md5': '3b427ae4b9d60619106de3185c2987cd',
+        'md5': '39d486f046212d8e1b911c52ab4691f8',
         'info_dict': {
             'id': '95008',
             'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Infamous Tiffany Teen Strip Tease Video',
             'duration': 194,
             'view_count': int,
index 60e68d98ac68ec5f0ccff4413af70a54bfd75ced..3ad993751759cca6900bbc9cc21b4dfe1a8589fa 100644 (file)
@@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor):
             'id': '637842556329505',
             'ext': 'mp4',
             'duration': 38,
-            'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...',
+            'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
         }
     }, {
         'note': 'Video without discernible title',
index 721e5fce011e113bf8c413543df496fc3eeca17d..d966e8403dfe9e03765d6a2eb0ab895a0da4100a 100644 (file)
@@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):
         },
     }, {
         'url': 'http://www.funnyordie.com/embed/e402820827',
-        'md5': 'ff4d83318f89776ed0250634cfaa8d36',
+        'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad',
         'info_dict': {
             'id': 'e402820827',
             'ext': 'mp4',
index 0dfa4853dbd3b1cbacaf2b9532dcdeffc8a9300c..14c024e48fcde2ebd2cc07ca7be4142b03bb86d7 100644 (file)
@@ -155,7 +155,6 @@ class GenericIE(InfoExtractor):
         # funnyordie embed
         {
             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
-            'md5': '7cf780be104d40fea7bae52eed4a470e',
             'info_dict': {
                 'id': '18e820ec3f',
                 'ext': 'mp4',
@@ -180,13 +179,13 @@ class GenericIE(InfoExtractor):
         # Embedded TED video
         {
             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
-            'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+            'md5': '65fdff94098e4a607385a60c5177c638',
             'info_dict': {
-                'id': '981',
+                'id': '1969',
                 'ext': 'mp4',
-                'title': 'My web playroom',
-                'uploader': 'Ze Frank',
-                'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
+                'title': 'Hidden miracles of the natural world',
+                'uploader': 'Louie Schwartzberg',
+                'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
             }
         },
         # Embeded Ustream video
@@ -226,21 +225,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': 'Requires rtmpdump'
             }
         },
-        # smotri embed
-        {
-            'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
-            'md5': 'ec40048448e9284c9a1de77bb188108b',
-            'info_dict': {
-                'id': 'v27008541fad',
-                'ext': 'mp4',
-                'title': 'Крым и Севастополь вошли в состав России',
-                'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
-                'duration': 900,
-                'upload_date': '20140318',
-                'uploader': 'rbctv_2012_4',
-                'uploader_id': 'rbctv_2012_4',
-            },
-        },
         # Condé Nast embed
         {
             'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -295,13 +279,13 @@ class GenericIE(InfoExtractor):
         {
             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
             'info_dict': {
-                'id': 'jpSGZsgga_I',
+                'id': '4vAffPZIT44',
                 'ext': 'mp4',
-                'title': 'Asphalt 8: Airborne - Launch Trailer',
+                'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
                 'uploader': 'Gameloft',
                 'uploader_id': 'gameloft',
-                'upload_date': '20130821',
-                'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a',
+                'upload_date': '20140828',
+                'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
             },
             'params': {
                 'skip_download': True,
index 73bd6d8903018ce374488441e6fbc47dff318244..363dc66086e350af241959f2b547004ebd07d6db 100644 (file)
@@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor):
             'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(),
             video_id, 'Downloading player config XML')
 
-        video_url = config.find('.//file').text
-        uploader = config.find('.//author').text
-        timestamp = parse_iso8601(config.find('.//date').text)
-        duration = parse_duration(config.find('.//duration').text)
-        thumbnail = config.find('.//image').text
+        video_url = config.find('file').text
+        uploader = config.find('author').text
+        timestamp = parse_iso8601(config.find('date').text)
+        duration = parse_duration(config.find('duration').text)
+        thumbnail = config.find('image').text
 
         media = self._download_xml(
             'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML')
 
-        title = media.find('.//title').text
+        title = media.find('title').text
 
         return {
             'id': video_id,
index bebfe8568705165c31ed73e1b5bfc6f76fa58510..53714f47f1a0a8cd1abb8aab0ec09cdbd283d51b 100644 (file)
@@ -38,11 +38,9 @@ class GolemIE(InfoExtractor):
         }
 
         formats = []
-        for e in config.findall('./*[url]'):
+        for e in config:
             url = e.findtext('./url')
             if not url:
-                self._downloader.report_warning(
-                    "{0}: url: empty, skipping".format(e.tag))
                 continue
 
             formats.append({
@@ -57,7 +55,7 @@ class GolemIE(InfoExtractor):
         info['formats'] = formats
 
         thumbnails = []
-        for e in config.findall('.//teaser[url]'):
+        for e in config.findall('.//teaser'):
             url = e.findtext('./url')
             if not url:
                 continue
index 12e9e61c465cce7f30ed93d293d946ea05f4de74..c80185b535b8d9853adf886fb8b5582284d12a03 100644 (file)
@@ -89,7 +89,12 @@ class IGNIE(InfoExtractor):
                 '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
                 webpage)
             if multiple_urls:
-                return [self.url_result(u, ie='IGN') for u in multiple_urls]
+                entries = [self.url_result(u, ie='IGN') for u in multiple_urls]
+                return {
+                    '_type': 'playlist',
+                    'id': name_or_id,
+                    'entries': entries,
+                }
 
         video_id = self._find_video_id(webpage)
         result = self._get_video_info(video_id)
index 4ddda2f1bb86dd534f623218b2acd74566d781e1..53f9a5f7587bcf36d9d4a63f6cfa36d90496dd28 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -12,12 +14,13 @@ class InternetVideoArchiveIE(InfoExtractor):
     _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
 
     _TEST = {
-        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
-        u'file': u'452693.mp4',
-        u'info_dict': {
-            u'title': u'SKYFALL',
-            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
-            u'duration': 153,
+        'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+        'info_dict': {
+            'id': '452693',
+            'ext': 'mp4',
+            'title': 'SKYFALL',
+            'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
+            'duration': 149,
         },
     }
 
@@ -42,7 +45,7 @@ class InternetVideoArchiveIE(InfoExtractor):
         url = self._build_url(query)
 
         flashconfiguration = self._download_xml(url, video_id,
-            u'Downloading flash configuration')
+            'Downloading flash configuration')
         file_url = flashconfiguration.find('file').text
         file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
         # Replace some of the parameters in the query to get the best quality
@@ -51,7 +54,7 @@ class InternetVideoArchiveIE(InfoExtractor):
             lambda m: self._clean_query(m.group()),
             file_url)
         info = self._download_xml(file_url, video_id,
-            u'Downloading video info')
+            'Downloading video info')
         item = info.find('channel/item')
 
         def _bp(p):
index a83dd249f6cd5694884158de6471802df6fe2d01..07ef682ee38052088d07f3f232c245ded77b2193 100644 (file)
@@ -63,7 +63,8 @@ class IzleseneIE(InfoExtractor):
 
         title = self._og_search_title(webpage)
         description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        thumbnail = self._proto_relative_url(
+            self._og_search_thumbnail(webpage), scheme='http:')
 
         uploader = self._html_search_regex(
             r"adduserUsername\s*=\s*'([^']+)';",
index 9b553b9fa52873739b0d4ecb4e3927e1beff929e..5aa32bf092d8bfae17fd302fe399acf5d5264164 100644 (file)
@@ -11,10 +11,9 @@ from ..utils import (
 
 
 class JukeboxIE(InfoExtractor):
-    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
+    _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html'
     _TEST = {
         'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
-        'md5': '1574e9b4d6438446d5b7dbcdf2786276',
         'info_dict': {
             'id': 'r303r',
             'ext': 'flv',
@@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('video_id')
+        video_id = self._match_id(url)
 
         html = self._download_webpage(url, video_id)
         iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
new file mode 100644 (file)
index 0000000..fca0bfe
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    parse_duration,
+    remove_end,
+)
+
+
+class LRTIE(InfoExtractor):
+    IE_NAME = 'lrt.lt'
+    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
+        'info_dict': {
+            'id': '54391',
+            'ext': 'mp4',
+            'title': 'Septynios Kauno dienos',
+            'description': 'Kauno miesto ir apskrities naujienos',
+            'duration': 1783,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = remove_end(self._og_search_title(webpage), ' - LRT')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage)
+        duration = parse_duration(self._search_regex(
+            r"'duration':\s*'([^']+)',", webpage,
+            'duration', fatal=False, default=None))
+
+        formats = []
+        for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage):
+            data = json.loads(js_to_json(js))
+            if data['provider'] == 'rtmp':
+                formats.append({
+                    'format_id': 'rtmp',
+                    'ext': determine_ext(data['file']),
+                    'url': data['streamer'],
+                    'play_path': 'mp4:%s' % data['file'],
+                    'preference': -1,
+                })
+            else:
+                formats.extend(
+                    self._extract_m3u8_formats(data['file'], video_id, 'mp4'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
index 963c4587c88c26c02e479ee1c3764bb2bd46269c..cc7c921c364d64ee504fa6d31265d13a96565e8d 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    compat_urllib_parse_urlparse,
     int_or_none,
     remove_end,
 )
@@ -13,76 +14,116 @@ from ..utils import (
 
 class NFLIE(InfoExtractor):
     IE_NAME = 'nfl.com'
-    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
-    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
-    _TEST = {
-        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-        # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5',  # md5 checksum fluctuates
-        'info_dict': {
-            'id': '0ap3000000398478',
-            'ext': 'mp4',
-            'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
-            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
-            'upload_date': '20140921',
-            'timestamp': 1411337580,
-            'thumbnail': 're:^https?://.*\.jpg$',
+    _VALID_URL = r'''(?x)https?://
+        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+        (?:.+?/)*
+        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+    _TESTS = [
+        {
+            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+            'md5': '394ef771ddcd1354f665b471d78ec4c6',
+            'info_dict': {
+                'id': '0ap3000000398478',
+                'ext': 'mp4',
+                'title': 'Week 3: Redskins vs. Eagles highlights',
+                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+                'upload_date': '20140921',
+                'timestamp': 1411337580,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        },
+        {
+            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+            'info_dict': {
+                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+                'ext': 'mp4',
+                'title': 'LIVE: Post Game vs. Browns',
+                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+                'upload_date': '20131229',
+                'timestamp': 1388354455,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
         }
-    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id, host = mobj.group('id'), mobj.group('host')
 
-        config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
-                                     note='Downloading player config')
-        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
-        video_data = self._download_json(url_template.format(id=video_id), video_id)
+        webpage = self._download_webpage(url, video_id)
 
-        cdns = config.get('cdns')
-        if not cdns:
-            raise ExtractorError('Failed to get CDN data', expected=True)
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+        config = self._download_json(config_url, video_id,
+                                     note='Downloading player config')
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)
 
         formats = []
-        streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
-        for name, cdn in cdns.items():
-            # LimeLight streams don't seem to work
-            if cdn.get('name') == 'LIMELIGHT':
-                continue
-
-            protocol = cdn.get('protocol')
-            host = remove_end(cdn.get('host', ''), '/')
-            if not (protocol and host):
-                continue
-
-            path_prefix = cdn.get('pathprefix', '')
-            if path_prefix and not path_prefix.endswith('/'):
-                path_prefix = '%s/' % path_prefix
-
-            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
-                protocol=protocol,
-                host=host,
-                prefix=path_prefix,
-                path=p,
-            )
-
-            if protocol == 'rtmp':
-                preference = -2
-            elif 'prog' in name.lower():
-                preference = -1
-            else:
-                preference = 0
-
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
             for stream in streams:
-                path = stream.get('path')
-                if not path:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
+            cdns = config.get('cdns')
+            if not cdns:
+                raise ExtractorError('Failed to get CDN data', expected=True)
+
+            for name, cdn in cdns.items():
+                # LimeLight streams don't seem to work
+                if cdn.get('name') == 'LIMELIGHT':
                     continue
 
-                formats.append({
-                    'url': get_url(path),
-                    'vbr': int_or_none(stream.get('rate', 0), 1000),
-                    'preference': preference,
-                    'format_note': name,
-                })
+                protocol = cdn.get('protocol')
+                host = remove_end(cdn.get('host', ''), '/')
+                if not (protocol and host):
+                    continue
+
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
+                preference = 0
+                if protocol == 'rtmp':
+                    preference = -2
+                elif 'prog' in name.lower():
+                    preference = 1
+
+                for stream in streams:
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))
 
         self._sort_formats(formats)
 
@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': video_data.get('storyHeadline'),
+            'title': video_data.get('headline'),
             'formats': formats,
             'description': video_data.get('caption'),
             'duration': video_data.get('duration'),
index 2adfde9091b5ceae50abd8f0c79abd129c259751..8f140d62660b896f5a6f819d621a762d13fbdb69 100644 (file)
@@ -4,6 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    unified_strdate,
     US_RATINGS,
 )
 
@@ -11,10 +12,10 @@ from ..utils import (
 class PBSIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?:
-            # Direct video URL
-            video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
-            # Article with embedded player
-           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
+           # Direct video URL
+           video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+           # Article with embedded player (or direct video)
+           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
            # Player
            video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
         )
@@ -65,10 +66,25 @@ class PBSIE(InfoExtractor):
                 'duration': 6559,
                 'thumbnail': 're:^https?://.*\.jpg$',
             }
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
+            'md5': '908f3e5473a693b266b84e25e1cf9703',
+            'info_dict': {
+                'id': '2365160389',
+                'display_id': 'killer-typhoon',
+                'ext': 'mp4',
+                'description': 'md5:c741d14e979fc53228c575894094f157',
+                'title': 'Killer Typhoon',
+                'duration': 3172,
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'upload_date': '20140122',
+            }
         }
+
     ]
 
-    def _extract_ids(self, url):
+    def _extract_webpage(self, url):
         mobj = re.match(self._VALID_URL, url)
 
         presumptive_id = mobj.group('presumptive_id')
@@ -76,15 +92,20 @@ class PBSIE(InfoExtractor):
         if presumptive_id:
             webpage = self._download_webpage(url, display_id)
 
+            upload_date = unified_strdate(self._search_regex(
+                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
+                webpage, 'upload date', default=None))
+
             MEDIA_ID_REGEXES = [
                 r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
                 r'class="coveplayerid">([^<]+)<',                       # coveplayer
+                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
             ]
 
             media_id = self._search_regex(
                 MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
             if media_id:
-                return media_id, presumptive_id
+                return media_id, presumptive_id, upload_date
 
             url = self._search_regex(
                 r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
@@ -104,10 +125,10 @@ class PBSIE(InfoExtractor):
             video_id = mobj.group('id')
             display_id = video_id
 
-        return video_id, display_id
+        return video_id, display_id, None
 
     def _real_extract(self, url):
-        video_id, display_id = self._extract_ids(url)
+        video_id, display_id, upload_date = self._extract_webpage(url)
 
         info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
         info = self._download_json(info_url, display_id)
@@ -119,6 +140,7 @@ class PBSIE(InfoExtractor):
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'title': info['title'],
             'url': info['alternate_encoding']['url'],
             'ext': 'mp4',
@@ -126,4 +148,5 @@ class PBSIE(InfoExtractor):
             'thumbnail': info.get('image_url'),
             'duration': info.get('duration'),
             'age_limit': age_limit,
+            'upload_date': upload_date,
         }
index 5b2a723c1d8dce6f05fcdd3647c93b48ad41dc5a..619496de7a57f9ab297b708bbffb3005c85e0dd8 100644 (file)
@@ -144,7 +144,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 'id': '2156342',
                 'ext': 'mp4',
                 'title': 'Kurztrips zum Valentinstag',
-                'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+                'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
                 'duration': 307.24,
             },
             'params': {
@@ -180,12 +180,10 @@ class ProSiebenSat1IE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
 
-        page = self._download_webpage(url, video_id, 'Downloading page')
-
-        clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id')
+        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
 
         access_token = 'testclient'
         client_name = 'kolibri-1.2.5'
@@ -234,12 +232,12 @@ class ProSiebenSat1IE(InfoExtractor):
 
         urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
 
-        title = self._html_search_regex(self._TITLE_REGEXES, page, 'title')
-        description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False)
-        thumbnail = self._og_search_thumbnail(page)
+        title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
+        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
 
         upload_date = unified_strdate(self._html_search_regex(
-            self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None))
+            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
 
         formats = []
 
index 2007a00134dfe73cd721dbee9e86e4f349a2e034..94602e89e56549243ed38ecb107ef842cd8ebd46 100644 (file)
@@ -9,7 +9,6 @@ from ..utils import (
     compat_urllib_parse,
     unified_strdate,
     str_to_int,
-    int_or_none,
 )
 from ..aes import aes_decrypt_text
 
@@ -40,31 +39,42 @@ class SpankwireIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
+        title = self._html_search_regex(
+            r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
+            r'<div\s+id="descriptionContent">([^<]+)<',
+            webpage, 'description', fatal=False)
         thumbnail = self._html_search_regex(
-            r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+            r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
+            webpage, 'thumbnail', fatal=False)
 
         uploader = self._html_search_regex(
-            r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
+            r'by:\s*<a [^>]*>(.+?)</a>',
+            webpage, 'uploader', fatal=False)
         uploader_id = self._html_search_regex(
-            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
-        upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
-        if upload_date:
-            upload_date = unified_strdate(upload_date)
-        
-        view_count = self._html_search_regex(
-            r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
-        if view_count:
-            view_count = str_to_int(view_count)
-        comment_count = int_or_none(self._html_search_regex(
-            r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
+            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"',
+            webpage, 'uploader id', fatal=False)
+        upload_date = unified_strdate(self._html_search_regex(
+            r'</a> on (.+?) at \d+:\d+',
+            webpage, 'upload date', fatal=False))
 
-        video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
+        view_count = str_to_int(self._html_search_regex(
+            r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
+            webpage, 'view count', fatal=False))
+        comment_count = str_to_int(self._html_search_regex(
+            r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+            webpage, 'comment count', fatal=False))
+
+        video_urls = list(map(
+            compat_urllib_parse.unquote,
+            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
-            password = self._html_search_regex(r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ')
-            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
+            password = self._html_search_regex(
+                r'flashvars\.video_title = "([^"]+)',
+                webpage, 'password').replace('+', ' ')
+            video_urls = list(map(
+                lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'),
+                video_urls))
 
         formats = []
         for video_url in video_urls:
index 185353bef7eb363ad9aa74487d6c384e975560f9..abb82778325fd74f55a2ae1ce00f8f98316ad0a1 100644 (file)
@@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor):
         'info_dict': {
             'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
             'ext': 'mp4',
-            'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
+            'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
             'categories': ['Badminton'],
             'view_count': int,
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE',
+            'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV',
             'timestamp': int,
             'upload_date': 're:^201408[23][0-9]$',
         },
index 7de3c9dd5014586a118ab3eb55365897d160945d..263f09b4645fa8b6255f1216e99cab27afce2bee 100644 (file)
@@ -39,10 +39,10 @@ class SunPornoIE(InfoExtractor):
             r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
 
         duration = parse_duration(self._search_regex(
-            r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False))
+            r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
-            r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False))
+            r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
             r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
 
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
new file mode 100644 (file)
index 0000000..77e0562
--- /dev/null
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    compat_urllib_request,
+    float_or_none,
+    parse_iso8601,
+)
+
+
+class TapelyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
+    _API_URL = 'http://tape.ly/showtape?id={0:}'
+    _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
+    _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
+    _TESTS = [
+        {
+            'url': 'http://tape.ly/my-grief-as-told-by-water',
+            'info_dict': {
+                'id': 23952,
+                'title': 'my grief as told by water',
+                'thumbnail': 're:^https?://.*\.png$',
+                'uploader_id': 16484,
+                'timestamp': 1411848286,
+                'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.',
+            },
+            'playlist_count': 13,
+        },
+        {
+            'url': 'http://tape.ly/my-grief-as-told-by-water/1',
+            'md5': '79031f459fdec6530663b854cbc5715c',
+            'info_dict': {
+                'id': 258464,
+                'title': 'Dreaming Awake  (My Brightest Diamond)',
+                'ext': 'm4a',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        playlist_url = self._API_URL.format(display_id)
+        request = compat_urllib_request.Request(playlist_url)
+        request.add_header('X-Requested-With', 'XMLHttpRequest')
+        request.add_header('Accept', 'application/json')
+
+        playlist = self._download_json(request, display_id)
+
+        tape = playlist['tape']
+
+        entries = []
+        for s in tape['songs']:
+            song = s['song']
+            entry = {
+                'id': song['id'],
+                'duration': float_or_none(song.get('songduration'), 1000),
+                'title': song['title'],
+            }
+            if song['source'] == 'S3':
+                entry.update({
+                    'url': self._S3_SONG_URL.format(song['filename']),
+                })
+                entries.append(entry)
+            elif song['source'] == 'YT':
+                self.to_screen('YouTube video detected')
+                yt_id = song['filename'].replace('/youtube/', '')
+                entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id))
+                entries.append(entry)
+            elif song['source'] == 'SC':
+                self.to_screen('SoundCloud song detected')
+                sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename'])
+                entry.update(self.url_result(sc_url, 'Soundcloud'))
+                entries.append(entry)
+            else:
+                self.report_warning('Unknown song source: %s' % song['source'])
+
+        if mobj.group('songnr'):
+            songnr = int(mobj.group('songnr')) - 1
+            try:
+                return entries[songnr]
+            except IndexError:
+                raise ExtractorError(
+                    'No song with index: %s' % mobj.group('songnr'),
+                    expected=True)
+
+        return {
+            '_type': 'playlist',
+            'id': tape['id'],
+            'display_id': display_id,
+            'title': tape['name'],
+            'entries': entries,
+            'thumbnail': tape.get('image_url'),
+            'description': clean_html(tape.get('subtext')),
+            'like_count': tape.get('likescount'),
+            'uploader_id': tape.get('user_id'),
+            'timestamp': parse_iso8601(tape.get('published_at')),
+        }
index 1cca47771290beaa2d4090126e181afe4059f460..d5e28efada55a91a480ce031df0bc2774de2ccc6 100644 (file)
@@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor):
             thumbnail = 'http://' + thumbnail
         return {
             'id': video_id,
-            'title': talk_info['title'],
+            'title': talk_info['title'].strip(),
             'uploader': talk_info['speaker'],
             'thumbnail': thumbnail,
             'description': self._og_search_description(webpage),
index 607e947bbaf506c71814cd5a36ff3851f3afbb91..496f15d80b478f94bc2aac86c3d20417e2b09925 100644 (file)
@@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         # extract download link from mobile player page
         webpage_player = self._download_webpage(
@@ -57,3 +56,29 @@ class THVideoIE(InfoExtractor):
             'description': description,
             'upload_date': upload_date
         }
+
+
+class THVideoPlaylistIE(InfoExtractor):
+    _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://thvideo.tv/mylist2',
+        'info_dict': {
+            'id': '2',
+            'title': '幻想万華鏡',
+        },
+        'playlist_mincount': 23,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+        list_title = self._html_search_regex(
+            r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title',
+            fatal=False)
+
+        entries = [
+            self.url_result('http://thvideo.tv/v/th' + id, 'THVideo')
+            for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)]
+
+        return self.playlist_result(entries, playlist_id, list_title)
index dc86978509da2b8680fd37bdd912028366c3928b..27962b5fe146dd16e85f46e341725b4e30bf24e1 100644 (file)
@@ -17,16 +17,16 @@ class TvigleIE(InfoExtractor):
 
     _TESTS = [
         {
-            'url': 'http://www.tvigle.ru/video/brat-2/',
-            'md5': '72cb7eab33e54314e1790da402d3c9c3',
+            'url': 'http://www.tvigle.ru/video/brat/',
+            'md5': 'ff4344a4894b0524441fb6f8218dc716',
             'info_dict': {
-                'id': '5119390',
-                'display_id': 'brat-2',
+                'id': '5118490',
+                'display_id': 'brat',
                 'ext': 'mp4',
-                'title': 'Брат 2 ',
-                'description': 'md5:5751f4fe345a58e1692585c361294bd8',
-                'duration': 7356.369,
-                'age_limit': 0,
+                'title': 'Брат',
+                'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb',
+                'duration': 5722.6,
+                'age_limit': 16,
             },
         },
         {
@@ -71,6 +71,7 @@ class TvigleIE(InfoExtractor):
                     'format_id': '%s-%s' % (vcodec, quality),
                     'vcodec': vcodec,
                     'height': int(quality[:-1]),
+                    'filesize': item['video_files_size'][vcodec][quality],
                 })
         self._sort_formats(formats)
 
index 7d27d6c57e61afeb19e7f8d10171b0b9d944bf57..96447007021f054fc155906511833aab5ac551c0 100644 (file)
@@ -31,7 +31,7 @@ class VGTVIE(InfoExtractor):
             'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen',
             'info_dict': {
                 'id': '100764',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',
                 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',
                 'thumbnail': 're:^https?://.*\.jpg',
@@ -50,7 +50,7 @@ class VGTVIE(InfoExtractor):
             'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
             'info_dict': {
                 'id': '100015',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
                 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
                 'thumbnail': 're:^https?://.*\.jpg',
index 4be1b878585525f70bec0be87c122bf3b10eee9b..d2c36b58a25ef7d98c98192ab9eccc0d710e85d9 100644 (file)
@@ -8,18 +8,19 @@ import itertools
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    clean_html,
     compat_HTTPError,
     compat_urllib_parse,
     compat_urllib_request,
-    clean_html,
-    get_element_by_attribute,
+    compat_urlparse,
     ExtractorError,
+    get_element_by_attribute,
+    InAdvancePagedList,
+    int_or_none,
     RegexNotFoundError,
-    smuggle_url,
     std_headers,
     unsmuggle_url,
     urlencode_postdata,
-    int_or_none,
 )
 
 
@@ -90,6 +91,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
                 'uploader_id': 'openstreetmapus',
                 'uploader': 'OpenStreetMap US',
                 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+                'description': 'md5:380943ec71b89736ff4bf27183233d09',
                 'duration': 1595,
             },
         },
@@ -104,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
                 'uploader': 'The BLN & Business of Software',
                 'uploader_id': 'theblnbusinessofsoftware',
                 'duration': 3610,
+                'description': None,
             },
         },
         {
@@ -118,6 +121,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
                 'uploader_id': 'user18948128',
                 'uploader': 'Jaime Marquínez Ferrándiz',
                 'duration': 10,
+                'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',
             },
             'params': {
                 'videopassword': 'youtube-dl',
@@ -204,6 +208,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        orig_url = url
         if mobj.group('pro') or mobj.group('player'):
             url = 'http://player.vimeo.com/video/' + video_id
 
@@ -274,18 +279,23 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
                 _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
 
         # Extract video description
-        video_description = None
-        try:
-            video_description = get_element_by_attribute("class", "description_wrapper", webpage)
-            if video_description:
-                video_description = clean_html(video_description)
-        except AssertionError as err:
-            # On some pages like (http://player.vimeo.com/video/54469442) the
-            # html tags are not closed, python 2.6 cannot handle it
-            if err.args[0] == 'we should not get here!':
-                pass
-            else:
-                raise
+
+        video_description = self._html_search_regex(
+            r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
+            webpage, 'description', default=None)
+        if not video_description:
+            video_description = self._html_search_meta(
+                'description', webpage, default=None)
+        if not video_description and mobj.group('pro'):
+            orig_webpage = self._download_webpage(
+                orig_url, video_id,
+                note='Downloading webpage for description',
+                fatal=False)
+            if orig_webpage:
+                video_description = self._html_search_meta(
+                    'description', orig_webpage, default=None)
+        if not video_description and not mobj.group('player'):
+            self._downloader.report_warning('Cannot find video description')
 
         # Extract video duration
         video_duration = int_or_none(config["video"].get("duration"))
@@ -533,32 +543,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
 
 
 class VimeoLikesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
     IE_NAME = 'vimeo:likes'
     IE_DESC = 'Vimeo user likes'
     _TEST = {
-        'url': 'https://vimeo.com/user20132939/likes',
-        'playlist_mincount': 4,
-        'add_ies': ['Generic'],
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
         "info_dict": {
-            "description": "Videos Philipp Hagemeister likes on Vimeo.",
-            "title": "Vimeo / Philipp Hagemeister's likes",
-        },
-        'params': {
-            'extract_flat': False,
+            "description": "See all the videos urza likes",
+            "title": 'Videos urza likes',
         },
     }
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
-        rss_url = '%s//vimeo.com/user%s/likes/rss' % (
-            self.http_scheme(), user_id)
-        surl = smuggle_url(rss_url, {
-            'force_videoid': '%s_likes' % user_id,
-            'to_generic': True,
-        })
+        webpage = self._download_webpage(url, user_id)
+        page_count = self._int(
+            self._search_regex(
+                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+                    .*?</a></li>\s*<li\s+class="pagination_next">
+                ''', webpage, 'page count'),
+            'page count', fatal=True)
+        PAGE_SIZE = 12
+        title = self._html_search_regex(
+            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+        description = self._html_search_meta('description', webpage)
+
+        def _get_page(idx):
+            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+                self.http_scheme(), user_id, idx + 1)
+            webpage = self._download_webpage(
+                page_url, user_id,
+                note='Downloading page %d/%d' % (idx + 1, page_count))
+            video_list = self._search_regex(
+                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+                webpage, 'video content')
+            paths = re.findall(
+                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+            for path in paths:
+                yield {
+                    '_type': 'url',
+                    'url': compat_urlparse.urljoin(page_url, path),
+                }
+
+        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
 
         return {
-            '_type': 'url',
-            'url': surl,
+            '_type': 'playlist',
+            'id': 'user%s_likes' % user_id,
+            'title': title,
+            'description': description,
+            'entries': pl,
         }
index fb0600f1a911b0f63922f112221e0cc025d27cce..ec3c010ad7e151bfc304315cdc5fd32bc21e8f43 100644 (file)
@@ -5,6 +5,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse_urlparse,
+    ExtractorError,
     parse_duration,
     qualities,
 )
@@ -14,13 +15,12 @@ class VuClipIE(InfoExtractor):
     _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
 
     _TEST = {
-        'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
-        'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
+        'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html',
         'info_dict': {
-            'id': '843902317',
+            'id': '922692425',
             'ext': '3gp',
-            'title': 'Movie Trailer: Noah',
-            'duration': 139,
+            'title': 'The Toy Soldiers - Hollywood Movie Trailer',
+            'duration': 180,
         }
     }
 
@@ -37,16 +37,32 @@ class VuClipIE(InfoExtractor):
             webpage = self._download_webpage(
                 adfree_url, video_id, note='Download post-ad page')
 
+        error_msg = self._html_search_regex(
+            r'<p class="message">(.*?)</p>', webpage, 'error message',
+            default=None)
+        if error_msg:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+
+        # The site alternates between two different page layouts
         links_code = self._search_regex(
-            r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
-            'links')
+            r'''(?xs)
+                (?:
+                    <img\s+src="/im/play.gif".*?>|
+                    <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
+                )
+                (.*?)
+                (?:
+                    <a\s+href="fblike|<div\s+class="social">
+                )
+            ''', webpage, 'links')
         title = self._html_search_regex(
             r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
 
         quality_order = qualities(['Reg', 'Hi'])
         formats = []
         for url, q in re.findall(
-                r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
+                r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code):
             format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
             formats.append({
                 'format_id': format_id,
@@ -56,7 +72,7 @@ class VuClipIE(InfoExtractor):
         self._sort_formats(formats)
 
         duration = parse_duration(self._search_regex(
-            r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))
+            r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False))
 
         return {
             'id': video_id,
index 4e89acd81bbb5ddfb97b1da3381b1b9f25873c96..bda3870db9f16e12c721c361696656df8be8a1b3 100644 (file)
@@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor):
         "info_dict": {
             "id": "wshh6a7q1ny0G34ZwuIO",
             "ext": "mp4",
-            "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+            "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
         }
     }
 
     def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('id')
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
 
-        webpage_src = self._download_webpage(url, video_id)
-
-        m_vevo_id = re.search(r'videoId=(.*?)&amp?',
-                              webpage_src)
+        m_vevo_id = re.search(r'videoId=(.*?)&amp?', webpage)
         if m_vevo_id is not None:
             return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
 
         video_url = self._search_regex(
-            r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
+            r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
 
         if 'youtube' in video_url:
             return self.url_result(video_url, ie='Youtube')
 
         video_title = self._html_search_regex(
-            r"<title>(.*)</title>", webpage_src, 'title')
+            r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+            webpage, 'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
         thumbnail = self._html_search_regex(
-            r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
+            r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
             fatal=False)
         if not thumbnail:
-            _title = r"""candytitles.*>(.*)</span>"""
-            mobj = re.search(_title, webpage_src)
+            _title = r'candytitles.*>(.*)</span>'
+            mobj = re.search(_title, webpage)
             if mobj is not None:
                 video_title = mobj.group(1)
 
index 3ab6017cdb51a3eaef6a3a1686719fba714780dd..221341c138c76f09186dc06bf19025bdb85e6d87 100644 (file)
@@ -37,16 +37,6 @@ class YahooIE(InfoExtractor):
                 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
             },
         },
-        {
-            'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
-            'md5': '410b7104aa9893b765bc22787a22f3d9',
-            'info_dict': {
-                'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
-                'ext': 'mp4',
-                'title': 'The World Loves Spider-Man',
-                'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
-            }
-        },
         {
             'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
             'md5': '60e8ac193d8fb71997caa8fce54c6460',
index 24872861a940dfe6b6788b8c043b5645b5d4780a..944d7da380d668809f511243f3f192a41beba3b6 100644 (file)
@@ -13,7 +13,7 @@ class YnetIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
-            'md5': '002b44ee2f33d50363a1c153bed524cf',
+            'md5': '4b29cb57c3dddd57642b3f051f535b07',
             'info_dict': {
                 'id': 'L-11659-99244',
                 'ext': 'flv',
@@ -22,7 +22,7 @@ class YnetIE(InfoExtractor):
             }
         }, {
             'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
-            'md5': '6455046ae1b48cf7e2b7cae285e53a16',
+            'md5': '8194c2ea221e9a639cac96b6b0753dc5',
             'info_dict': {
                 'id': 'L-8859-84418',
                 'ext': 'flv',
@@ -33,9 +33,7 @@ class YnetIE(InfoExtractor):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
         content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
index 99198e38092a8ed507b8e44aae41677e7ce17e17..9041cfa8770897851d06026942517a103df2a639 100644 (file)
@@ -26,7 +26,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     int_or_none,
-    PagedList,
+    OnDemandPagedList,
     unescapeHTML,
     unified_strdate,
     orderedSet,
@@ -655,6 +655,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
         # Get video webpage
         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+        pref_cookies = [
+            c for c in self._downloader.cookiejar
+            if c.domain == '.youtube.com' and c.name == 'PREF']
+        for pc in pref_cookies:
+            if 'hl=' in pc.value:
+                pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
+            else:
+                if pc.value:
+                    pc.value += '&'
+                pc.value += 'hl=en'
         video_webpage = self._download_webpage(url, video_id)
 
         # Attempt to extract SWF player URL
@@ -1341,7 +1351,7 @@ class YoutubeUserIE(InfoExtractor):
                     'id': video_id,
                     'title': title,
                 }
-        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
+        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
 
index 44dcb1e34fa9eca6b95c42e85433fec88fe6fd24..f651337adbedf1b58460d7fa147dec79664b0f27 100644 (file)
@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
         for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
             try:
                 i = opts.index(private_opt)
-                opts[i+1] = '<PRIVATE>'
+                opts[i+1] = 'PRIVATE'
             except ValueError:
                 pass
         return opts
index b644f4e920bf0353658ec9920abdb0541dbaf0e2..f8dd9c72d8e07ee7fc231b33f123df4c16b2d6d6 100644 (file)
@@ -673,6 +673,8 @@ class ExtractorError(Exception):
             expected = True
         if video_id is not None:
             msg = video_id + ': ' + msg
+        if cause:
+            msg += u' (caused by %r)' % cause
         if not expected:
             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
         super(ExtractorError, self).__init__(msg)
@@ -799,6 +801,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                 del req.headers['User-agent']
             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
             del req.headers['Youtubedl-user-agent']
+
+        if sys.version_info < (2, 7) and '#' in req.get_full_url():
+            # Python 2.6 does not strip URL fragments before issuing requests
+            req._Request__original = req._Request__original.partition('#')[0]
+            req._Request__r_type = req._Request__r_type.partition('#')[0]
+
         return req
 
     def http_response(self, req, resp):
@@ -884,6 +892,7 @@ def unified_strdate(date_str):
         '%d/%m/%Y',
         '%d/%m/%y',
         '%Y/%m/%d %H:%M:%S',
+        '%d/%m/%Y %H:%M:%S',
         '%Y-%m-%d %H:%M:%S',
         '%d.%m.%Y %H:%M',
         '%d.%m.%Y %H.%M',
@@ -1384,14 +1393,16 @@ def check_executable(exe, args=[]):
 
 
 class PagedList(object):
-    def __init__(self, pagefunc, pagesize):
-        self._pagefunc = pagefunc
-        self._pagesize = pagesize
-
     def __len__(self):
         # This is only useful for tests
         return len(self.getslice())
 
+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
     def getslice(self, start=0, end=None):
         res = []
         for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1441,35 @@ class PagedList(object):
         return res
 
 
+class InAdvancePagedList(PagedList):
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        start_page = start // self._pagesize
+        end_page = (
+            self._pagecount if end is None else (end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    page = page[:only_more]
+                    res.extend(page)
+                    break
+            res.extend(page)
+        return res
+
+
 def uppercase_escape(s):
     unicode_escape = codecs.getdecoder('unicode_escape')
     return re.sub(
@@ -1540,27 +1580,24 @@ def strip_jsonp(code):
 
 def js_to_json(code):
     def fix_kv(m):
-        key = m.group(2)
-        if key.startswith("'"):
-            assert key.endswith("'")
-            assert '"' not in key
-            key = '"%s"' % key[1:-1]
-        elif not key.startswith('"'):
-            key = '"%s"' % key
-
-        value = m.group(4)
-        if value.startswith("'"):
-            assert value.endswith("'")
-            assert '"' not in value
-            value = '"%s"' % value[1:-1]
-
-        return m.group(1) + key + m.group(3) + value
+        v = m.group(0)
+        if v in ('true', 'false', 'null'):
+            return v
+        if v.startswith('"'):
+            return v
+        if v.startswith("'"):
+            v = v[1:-1]
+            v = re.sub(r"\\\\|\\'|\"", lambda m: {
+                '\\\\': '\\\\',
+                "\\'": "'",
+                '"': '\\"',
+            }[m.group(0)], v)
+        return '"%s"' % v
 
     res = re.sub(r'''(?x)
-            ([{,]\s*)
-            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
-            (:\s*)
-            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
+        "(?:[^"\\]*(?:\\\\|\\")?)*"|
+        '(?:[^'\\]*(?:\\\\|\\')?)*'|
+        [a-zA-Z_][a-zA-Z_0-9]*
         ''', fix_kv, code)
     res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
     return res
index eb4356811004f264e1b3f47578f8547edf93d73c..1384b496b31c290b8c6ff5984831339076cc7d39 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.09.28.1'
+__version__ = '2014.09.29.2'