Merge remote-tracking branch 'diffycat/thvideo-update'
author    Philipp Hagemeister <phihag@phihag.de>
          Sun, 28 Sep 2014 22:36:55 +0000 (00:36 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
          Sun, 28 Sep 2014 22:36:55 +0000 (00:36 +0200)
test/test_utils.py
youtube_dl/extractor/nfl.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py

diff --git a/test/test_utils.py b/test/test_utils.py
index 3efbed29dd34de570f2db4e6eb4954ec2f4b9c6e..6419b3ca96c7f29d3fcd61c86b31aa646ea91470 100644
@@ -22,7 +22,8 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     get_meta_content,
     orderedSet,
-    PagedList,
+    OnDemandPagedList,
+    InAdvancePagedList,
     parse_duration,
     read_batch_urls,
     sanitize_filename,
@@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase):
                 for i in range(firstid, upto):
                     yield i
 
-            pl = PagedList(get_page, pagesize)
+            pl = OnDemandPagedList(get_page, pagesize)
             got = pl.getslice(*sliceargs)
             self.assertEqual(got, expected)
 
+            iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize)
+            got = iapl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
         testPL(5, 2, (), [0, 1, 2, 3, 4])
         testPL(5, 2, (1,), [1, 2, 3, 4])
         testPL(5, 2, (2,), [2, 3, 4])
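
Not part of the patch, for context only: a minimal sketch of how the two paged-list
flavours exercised above behave identically over a toy page function, assuming
youtube_dl is importable. All names below are illustrative.

    from youtube_dl.utils import OnDemandPagedList, InAdvancePagedList

    PAGE_SIZE = 2
    TOTAL = 5

    def get_page(pagenum):
        # yield the ids that fall on page `pagenum` of a 5-element list
        first = pagenum * PAGE_SIZE
        for i in range(first, min(first + PAGE_SIZE, TOTAL)):
            yield i

    # OnDemandPagedList fetches pages lazily until it hits a short page;
    # InAdvancePagedList is told the page count up front, mirroring the
    # `size // pagesize + 1` expression in the test.
    ondemand = OnDemandPagedList(get_page, PAGE_SIZE)
    inadvance = InAdvancePagedList(get_page, TOTAL // PAGE_SIZE + 1, PAGE_SIZE)
    assert ondemand.getslice(1, 4) == inadvance.getslice(1, 4) == [1, 2, 3]
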
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 963c4587c88c26c02e479ee1c3764bb2bd46269c..4832b3ce4b765d332ea8e827205547bd062c400c 100644
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    compat_urllib_parse,
     int_or_none,
     remove_end,
 )
@@ -13,76 +14,116 @@ from ..utils import (
 
 class NFLIE(InfoExtractor):
     IE_NAME = 'nfl.com'
-    _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)'
-    _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json'
-    _TEST = {
-        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
-        # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5',  # md5 checksum fluctuates
-        'info_dict': {
-            'id': '0ap3000000398478',
-            'ext': 'mp4',
-            'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights',
-            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
-            'upload_date': '20140921',
-            'timestamp': 1411337580,
-            'thumbnail': 're:^https?://.*\.jpg$',
+    _VALID_URL = r'''(?x)https?://
+        (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
+        (?:.+?/)*
+        (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
+    _TESTS = [
+        {
+            'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+            'md5': '394ef771ddcd1354f665b471d78ec4c6',
+            'info_dict': {
+                'id': '0ap3000000398478',
+                'ext': 'mp4',
+                'title': 'Week 3: Redskins vs. Eagles highlights',
+                'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+                'upload_date': '20140921',
+                'timestamp': 1411337580,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        },
+        {
+            'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+            'info_dict': {
+                'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+                'ext': 'mp4',
+                'title': 'LIVE: Post Game vs. Browns',
+                'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+                'upload_date': '20131229',
+                'timestamp': 1388354455,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            }
+        }
+    ]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
         }
-    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id, host = mobj.group('id'), mobj.group('host')
 
-        config = self._download_json(self._PLAYER_CONFIG_URL, video_id,
-                                     note='Downloading player config')
-        url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config)
-        video_data = self._download_json(url_template.format(id=video_id), video_id)
+        webpage = self._download_webpage(url, video_id)
 
-        cdns = config.get('cdns')
-        if not cdns:
-            raise ExtractorError('Failed to get CDN data', expected=True)
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL'))
+        config = self._download_json(config_url, video_id,
+                                     note='Downloading player config')
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)
 
         formats = []
-        streams = video_data.get('cdnData', {}).get('bitrateInfo', [])
-        for name, cdn in cdns.items():
-            # LimeLight streams don't seem to work
-            if cdn.get('name') == 'LIMELIGHT':
-                continue
-
-            protocol = cdn.get('protocol')
-            host = remove_end(cdn.get('host', ''), '/')
-            if not (protocol and host):
-                continue
-
-            path_prefix = cdn.get('pathprefix', '')
-            if path_prefix and not path_prefix.endswith('/'):
-                path_prefix = '%s/' % path_prefix
-
-            get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format(
-                protocol=protocol,
-                host=host,
-                prefix=path_prefix,
-                path=p,
-            )
-
-            if protocol == 'rtmp':
-                preference = -2
-            elif 'prog' in name.lower():
-                preference = -1
-            else:
-                preference = 0
-
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse.urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
             for stream in streams:
-                path = stream.get('path')
-                if not path:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
+            cdns = config.get('cdns')
+            if not cdns:
+                raise ExtractorError('Failed to get CDN data', expected=True)
+
+            for name, cdn in cdns.items():
+                # LimeLight streams don't seem to work
+                if cdn.get('name') == 'LIMELIGHT':
                     continue
 
-                formats.append({
-                    'url': get_url(path),
-                    'vbr': int_or_none(stream.get('rate', 0), 1000),
-                    'preference': preference,
-                    'format_note': name,
-                })
+                protocol = cdn.get('protocol')
+                host = remove_end(cdn.get('host', ''), '/')
+                if not (protocol and host):
+                    continue
+
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
+                preference = 0
+                if protocol == 'rtmp':
+                    preference = -2
+                elif 'prog' in name.lower():
+                    preference = 1
+
+                for stream in streams:
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))
 
         self._sort_formats(formats)
 
@@ -94,7 +135,7 @@ class NFLIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': video_data.get('storyHeadline'),
+            'title': video_data.get('headline'),
             'formats': formats,
             'description': video_data.get('caption'),
             'duration': video_data.get('duration'),
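
To make the new helper concrete, here is a standalone sketch of what
format_from_stream assembles for one bitrateInfo entry. The sample stream dict
and host are hypothetical, and int_or_none is simplified from the real utility.

    def int_or_none(v, scale=1):
        # simplified stand-in for youtube_dl.utils.int_or_none
        return None if v is None else int(v) // scale

    def format_from_stream(stream, protocol, host, path_prefix='',
                           preference=0, note=None):
        return {
            'url': '%s://%s/%s%s' % (protocol, host, path_prefix,
                                     stream.get('path')),
            'vbr': int_or_none(stream.get('rate', 0), 1000),
            'preference': preference,
            'format_note': note,
        }

    sample = {'path': 'mp4/game_highlights_3200.mp4', 'rate': 3200000}
    print(format_from_stream(sample, 'http', 'vod.example.com',
                             'vodprog/', preference=1, note='httpprog'))
    # -> a format dict like:
    # {'url': 'http://vod.example.com/vodprog/mp4/game_highlights_3200.mp4',
    #  'vbr': 3200, 'preference': 1, 'format_note': 'httpprog'}
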
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 4be1b878585525f70bec0be87c122bf3b10eee9b..403d0bb28ebb4e3d3f1d93af46452485dfe44bb6 100644
@@ -8,18 +8,19 @@ import itertools
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    clean_html,
     compat_HTTPError,
     compat_urllib_parse,
     compat_urllib_request,
-    clean_html,
-    get_element_by_attribute,
+    compat_urlparse,
     ExtractorError,
+    get_element_by_attribute,
+    InAdvancePagedList,
+    int_or_none,
     RegexNotFoundError,
-    smuggle_url,
     std_headers,
     unsmuggle_url,
     urlencode_postdata,
-    int_or_none,
 )
 
 
@@ -533,32 +534,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
 
 
 class VimeoLikesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
     IE_NAME = 'vimeo:likes'
     IE_DESC = 'Vimeo user likes'
     _TEST = {
-        'url': 'https://vimeo.com/user20132939/likes',
-        'playlist_mincount': 4,
-        'add_ies': ['Generic'],
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
         "info_dict": {
-            "description": "Videos Philipp Hagemeister likes on Vimeo.",
-            "title": "Vimeo / Philipp Hagemeister's likes",
-        },
-        'params': {
-            'extract_flat': False,
+            "description": "See all the videos urza likes",
+            "title": 'Videos urza likes',
         },
     }
 
     def _real_extract(self, url):
         user_id = self._match_id(url)
-        rss_url = '%s//vimeo.com/user%s/likes/rss' % (
-            self.http_scheme(), user_id)
-        surl = smuggle_url(rss_url, {
-            'force_videoid': '%s_likes' % user_id,
-            'to_generic': True,
-        })
+        webpage = self._download_webpage(url, user_id)
+        page_count = self._int(
+            self._search_regex(
+                r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)">
+                    .*?</a></li>\s*<li\s+class="pagination_next">
+                ''', webpage, 'page count'),
+            'page count', fatal=True)
+        PAGE_SIZE = 12
+        title = self._html_search_regex(
+            r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False)
+        description = self._html_search_meta('description', webpage)
+
+        def _get_page(idx):
+            page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
+                self.http_scheme(), user_id, idx + 1)
+            webpage = self._download_webpage(
+                page_url, user_id,
+                note='Downloading page %d/%d' % (idx + 1, page_count))
+            video_list = self._search_regex(
+                r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>',
+                webpage, 'video content')
+            paths = re.findall(
+                r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list)
+            for path in paths:
+                yield {
+                    '_type': 'url',
+                    'url': compat_urlparse.urljoin(page_url, path),
+                }
+
+        pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
 
         return {
-            '_type': 'url',
-            'url': surl,
+            '_type': 'playlist',
+            'id': 'user%s_likes' % user_id,
+            'title': title,
+            'description': description,
+            'entries': pl,
         }
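
The upshot of wiring _get_page through InAdvancePagedList is that only the pages
a slice actually needs are downloaded. A sketch under stated assumptions:
fake_page stands in for the _get_page closure above, and 293 likes at 12 per
page give the 25-page count that the extractor scrapes from the pagination
widget.

    from youtube_dl.utils import InAdvancePagedList

    PAGE_SIZE = 12
    page_count = (293 + PAGE_SIZE - 1) // PAGE_SIZE  # 25

    def fake_page(idx):
        # stand-in for _get_page: yield one url-typed entry per like
        for n in range(PAGE_SIZE):
            yield {'_type': 'url',
                   'url': 'https://vimeo.com/%d' % (idx * PAGE_SIZE + n)}

    pl = InAdvancePagedList(fake_page, page_count, PAGE_SIZE)
    print(len(pl.getslice(0, 5)))  # 5 -- only page 0 is ever generated
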
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 99198e38092a8ed507b8e44aae41677e7ce17e17..045507bc73f211a7227439fbe6425de9de0d658d 100644
@@ -26,7 +26,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     int_or_none,
-    PagedList,
+    OnDemandPagedList,
     unescapeHTML,
     unified_strdate,
     orderedSet,
@@ -1341,7 +1341,7 @@ class YoutubeUserIE(InfoExtractor):
                     'id': video_id,
                     'title': title,
                 }
-        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
+        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
 
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 44dcb1e34fa9eca6b95c42e85433fec88fe6fd24..f651337adbedf1b58460d7fa147dec79664b0f27 100644
@@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None):
         for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
             try:
                 i = opts.index(private_opt)
-                opts[i+1] = '<PRIVATE>'
+                opts[i+1] = 'PRIVATE'
             except ValueError:
                 pass
         return opts
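
For reference, a standalone re-implementation of the scrubbing this hunk
touches (an illustration, not the function as it appears in options.py): the
value that follows each login flag is masked before the options are echoed in
debug output, now without angle brackets.

    def hide_login_info(opts):
        # mask the value that follows each sensitive flag
        opts = list(opts)
        for private_opt in ['-p', '--password', '-u', '--username',
                            '--video-password']:
            try:
                i = opts.index(private_opt)
                opts[i + 1] = 'PRIVATE'
            except ValueError:
                pass
        return opts

    print(hide_login_info(['-u', 'alice', '-p', 'hunter2', '-f', 'best']))
    # ['-u', 'PRIVATE', '-p', 'PRIVATE', '-f', 'best']
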
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index b644f4e920bf0353658ec9920abdb0541dbaf0e2..9f49507c1256a707a139bbbba407a2b17d3a87ad 100644
@@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]):
 
 
 class PagedList(object):
-    def __init__(self, pagefunc, pagesize):
-        self._pagefunc = pagefunc
-        self._pagesize = pagesize
-
     def __len__(self):
         # This is only useful for tests
         return len(self.getslice())
 
+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
     def getslice(self, start=0, end=None):
         res = []
         for pagenum in itertools.count(start // self._pagesize):
@@ -1430,6 +1432,35 @@ class PagedList(object):
         return res
 
 
+class InAdvancePagedList(PagedList):
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        start_page = start // self._pagesize
+        end_page = (
+            self._pagecount if end is None else (end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    page = page[:only_more]
+                    res.extend(page)
+                    break
+            res.extend(page)
+        return res
+
+
 def uppercase_escape(s):
     unicode_escape = codecs.getdecoder('unicode_escape')
     return re.sub(
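
Finally, a worked example of the slicing arithmetic in
InAdvancePagedList.getslice (a sketch, assuming youtube_dl is importable): with
a page size of 3 and the slice [4, 7), start_page = 4 // 3 = 1,
end_page = 7 // 3 + 1 = 3, skip_elems = 1 drops the leading element of page 1,
and only_more = 7 - 4 = 3 stops the walk after three elements.

    from youtube_dl.utils import InAdvancePagedList

    def page(n):
        # pages: [0, 1, 2], [3, 4, 5], [6, 7, 8]
        return range(n * 3, n * 3 + 3)

    pl = InAdvancePagedList(page, 3, 3)
    assert pl.getslice(4, 7) == [4, 5, 6]
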