Merge branch 'xxxymovies' of https://github.com/peugeot/youtube-dl into peugeot-xxxym...
author Sergey M․ <dstftw@gmail.com>
Mon, 29 Dec 2014 14:38:28 +0000 (20:38 +0600)
committer Sergey M․ <dstftw@gmail.com>
Mon, 29 Dec 2014 14:38:28 +0000 (20:38 +0600)
test/helper.py
test/test_download.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/alphaporno.py [new file with mode: 0644]
youtube_dl/extractor/archiveorg.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/bbccouk.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/eroprofile.py [new file with mode: 0644]
youtube_dl/extractor/sunporno.py

index 8a820526abfe5dbae31a1921312c60d075667c32..96d58b7c12fd9119b3b5f65eb9c41cfc3c97f500 100644 (file)
@@ -99,7 +99,7 @@ def gettestcases(include_onlymatching=False):
 md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
 
 
-def expect_info_dict(self, expected_dict, got_dict):
+def expect_info_dict(self, got_dict, expected_dict):
     for info_field, expected in expected_dict.items():
         if isinstance(expected, compat_str) and expected.startswith('re:'):
             got = got_dict.get(info_field)
index a009aa475442ae588405a99f432af6feade92836..412f3dbce8683766ba53061fb2aecee95339b829 100644 (file)
@@ -155,7 +155,7 @@ def generator(test_case):
             if is_playlist:
                 self.assertEqual(res_dict['_type'], 'playlist')
                 self.assertTrue('entries' in res_dict)
-                expect_info_dict(self, test_case.get('info_dict', {}), res_dict)
+                expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
 
             if 'playlist_mincount' in test_case:
                 assertGreaterEqual(
@@ -204,7 +204,7 @@ def generator(test_case):
                 with io.open(info_json_fn, encoding='utf-8') as infof:
                     info_dict = json.load(infof)
 
-                expect_info_dict(self, tc.get('info_dict', {}), info_dict)
+                expect_info_dict(self, info_dict, tc.get('info_dict', {}))
         finally:
             try_rm_tcs_files()
             if is_playlist and res_dict is not None and res_dict.get('entries'):
index c8a77616e24fff789f11422a0ab46d71c0592eca..c79e5b7d03e6e92d36cf543740d9b6adf58039b4 100644 (file)
@@ -7,6 +7,7 @@ from .adobetv import AdobeTVIE
 from .adultswim import AdultSwimIE
 from .aftonbladet import AftonbladetIE
 from .aljazeera import AlJazeeraIE
+from .alphaporno import AlphaPornoIE
 from .anitube import AnitubeIE
 from .anysex import AnySexIE
 from .aol import AolIE
@@ -64,6 +65,7 @@ from .cnet import CNETIE
 from .cnn import (
     CNNIE,
     CNNBlogsIE,
+    CNNArticleIE,
 )
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
@@ -108,6 +110,7 @@ from .elpais import ElPaisIE
 from .empflix import EMPFlixIE
 from .engadget import EngadgetIE
 from .eporner import EpornerIE
+from .eroprofile import EroProfileIE
 from .escapist import EscapistIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py
new file mode 100644 (file)
index 0000000..c34719d
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+    parse_filesize,
+    int_or_none,
+)
+
+
+class AlphaPornoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
+        'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
+        'info_dict': {
+            'id': '258807',
+            'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
+            'ext': 'mp4',
+            'title': 'Sensual striptease porn with Samantha Alexandra',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'timestamp': 1418694611,
+            'upload_date': '20141216',
+            'duration': 387,
+            'filesize_approx': 54120000,
+            'tbr': 1145,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
+
+        video_url = self._search_regex(
+            r"video_url\s*:\s*'([^']+)'", webpage, 'video url')
+        ext = self._html_search_meta(
+            'encodingFormat', webpage, 'ext', default='.mp4')[1:]
+
+        title = self._search_regex(
+            [r'<meta content="([^"]+)" itemprop="description">',
+             r'class="title" itemprop="name">([^<]+)<'],
+            webpage, 'title')
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage, 'upload date'))
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration'))
+        filesize_approx = parse_filesize(self._html_search_meta(
+            'contentSize', webpage, 'file size'))
+        bitrate = int_or_none(self._html_search_meta(
+            'bitrate', webpage, 'bitrate'))
+        categories = self._html_search_meta(
+            'keywords', webpage, 'categories', default='').split(',')
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'ext': ext,
+            'title': title,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+            'tbr': bitrate,
+            'categories': categories,
+            'age_limit': age_limit,
+        }
index 34ce8429b121261784a1645c28e2a33cb76bcacb..9fc35a42b8612d828ccc3ae43c9e4f74782f5352 100644 (file)
@@ -1,42 +1,48 @@
 from __future__ import unicode_literals
 
-import json
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    unified_strdate,
-)
+from ..utils import unified_strdate
 
 
 class ArchiveOrgIE(InfoExtractor):
     IE_NAME = 'archive.org'
     IE_DESC = 'archive.org videos'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
-    _TEST = {
-        "url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
-        'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _TESTS = [{
+        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
         'md5': '8af1d4cf447933ed3c7f4871162602db',
         'info_dict': {
-            "title": "1968 Demo - FJCC Conference Presentation Reel #1",
-            "description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
-            "upload_date": "19681210",
-            "uploader": "SRI International"
+            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+            'ext': 'ogv',
+            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+            'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+            'upload_date': '19681210',
+            'uploader': 'SRI International'
+        }
+    }, {
+        'url': 'https://archive.org/details/Cops1922',
+        'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+        'info_dict': {
+            'id': 'Cops1922',
+            'ext': 'ogv',
+            'title': 'Buster Keaton\'s "Cops" (1922)',
+            'description': 'md5:70f72ee70882f713d4578725461ffcc3',
         }
-    }
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         json_url = url + ('?' if '?' in url else '&') + 'output=json'
-        json_data = self._download_webpage(json_url, video_id)
-        data = json.loads(json_data)
+        data = self._download_json(json_url, video_id)
+
+        def get_optional(data_dict, field):
+            return data_dict['metadata'].get(field, [None])[0]
 
-        title = data['metadata']['title'][0]
-        description = data['metadata']['description'][0]
-        uploader = data['metadata']['creator'][0]
-        upload_date = unified_strdate(data['metadata']['date'][0])
+        title = get_optional(data, 'title')
+        description = get_optional(data, 'description')
+        uploader = get_optional(data, 'creator')
+        upload_date = unified_strdate(get_optional(data, 'date'))
 
         formats = [
             {
index 219631b9b0dfa690a37e09a7b3473566543370e2..929dd3cc5550beb1b2da8874763084b5146d2f33 100644 (file)
@@ -37,7 +37,7 @@ class ArteTvIE(InfoExtractor):
             config_xml_url, video_id, note='Downloading configuration')
 
         formats = [{
-            'forma_id': q.attrib['quality'],
+            'format_id': q.attrib['quality'],
             # The playpath starts at 'mp4:', if we don't manually
             # split the url, rtmpdump will incorrectly parse them
             'url': q.text.split('mp4:', 1)[0],
@@ -133,7 +133,7 @@ class ArteTVPlus7IE(InfoExtractor):
                 'width': int_or_none(f.get('width')),
                 'height': int_or_none(f.get('height')),
                 'tbr': int_or_none(f.get('bitrate')),
-                'quality': qfunc(f['quality']),
+                'quality': qfunc(f.get('quality')),
                 'source_preference': source_pref,
             }
 
index 2d2f742aee26062af0c52a30dc5f017290daf51e..f690dc803c9b818342a311c02b414d9729943baf 100644 (file)
@@ -71,7 +71,20 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
                 'skip_download': True,
             },
             'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
-        },
+        }, {
+            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+            'info_dict': {
+                'id': 'b04v209v',
+                'ext': 'flv',
+                'title': 'Pete Tong, The Essential New Tune Special',
+                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+                'duration': 10800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        }
     ]
 
     def _extract_asx_playlist(self, connection, programme_id):
@@ -203,6 +216,59 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
 
         return formats, subtitles
 
+    def _download_playlist(self, playlist_id):
+        try:
+            playlist = self._download_json(
+                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+                playlist_id, 'Downloading playlist JSON')
+
+            version = playlist.get('defaultAvailableVersion')
+            if version:
+                smp_config = version['smpConfig']
+                title = smp_config['title']
+                description = smp_config['summary']
+                for item in smp_config['items']:
+                    kind = item['kind']
+                    if kind != 'programme' and kind != 'radioProgramme':
+                        continue
+                    programme_id = item.get('vpid')
+                    duration = int(item.get('duration'))
+                    formats, subtitles = self._download_media_selector(programme_id)
+                return programme_id, title, description, duration, formats, subtitles
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+                raise
+
+        # fallback to legacy playlist
+        playlist = self._download_xml(
+                'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
+                playlist_id, 'Downloading legacy playlist XML')
+
+        no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % playlist_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % playlist_id
+            elif reason == 'noMedia':
+                msg = 'Episode %s is not currently available' % playlist_id
+            else:
+                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
+            raise ExtractorError(msg, expected=True)
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind != 'programme' and kind != 'radioProgramme':
+                continue
+            title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
+            description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
+            programme_id = item.get('identifier')
+            duration = int(item.get('duration'))
+            formats, subtitles = self._download_media_selector(programme_id)
+
+        return programme_id, title, description, duration, formats, subtitles
+
     def _real_extract(self, url):
         group_id = self._match_id(url)
 
@@ -219,32 +285,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             duration = player['duration']
             formats, subtitles = self._download_media_selector(programme_id)
         else:
-            playlist = self._download_xml(
-                'http://www.bbc.co.uk/iplayer/playlist/%s' % group_id,
-                group_id, 'Downloading playlist XML')
-
-            no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
-            if no_items is not None:
-                reason = no_items.get('reason')
-                if reason == 'preAvailability':
-                    msg = 'Episode %s is not yet available' % group_id
-                elif reason == 'postAvailability':
-                    msg = 'Episode %s is no longer available' % group_id
-                elif reason == 'noMedia':
-                    msg = 'Episode %s is not currently available' % group_id
-                else:
-                    msg = 'Episode %s is not available: %s' % (group_id, reason)
-                raise ExtractorError(msg, expected=True)
-
-            for item in self._extract_items(playlist):
-                kind = item.get('kind')
-                if kind != 'programme' and kind != 'radioProgramme':
-                    continue
-                title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
-                description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-                programme_id = item.get('identifier')
-                duration = int(item.get('duration'))
-                formats, subtitles = self._download_media_selector(programme_id)
+            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
         if self._downloader.params.get('listsubtitles', False):
             self._list_available_subtitles(programme_id, subtitles)
index 81142ee419d45b9df9f75bdc152ab87e1317650f..1bff005d64ff72f00f0cd85871df2ee1c79c236b 100644 (file)
@@ -11,14 +11,14 @@ from ..utils import (
 
 
 class CNNIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(-ap)?|(?=&)))'''
+    _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(?:-ap)?|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
         'md5': '3e6121ea48df7e2259fe73a0628605c4',
         'info_dict': {
-            'id': 'sports_2013_06_09_nadal-1-on-1.cnn',
+            'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
             'ext': 'mp4',
             'title': 'Nadal wins 8th French Open title',
             'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
@@ -127,3 +127,28 @@ class CNNBlogsIE(InfoExtractor):
             'url': cnn_url,
             'ie_key': CNNIE.ie_key(),
         }
+
+
+class CNNArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
+    _TEST = {
+        'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
+        'md5': '275b326f85d80dff7592a9820f5dc887',
+        'info_dict': {
+            'id': 'bestoftv/2014/12/21/sotu-crowley-president-obama-north-korea-not-going-to-be-intimidated.cnn',
+            'ext': 'mp4',
+            'title': 'Obama: We\'re not going to be intimidated',
+            'description': 'md5:e735586f3dc936075fa654a4d91b21f9',
+            'upload_date': '20141220',
+        },
+        'add_ie': ['CNN'],
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, url_basename(url))
+        cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
+        return {
+            '_type': 'url',
+            'url': 'http://cnn.com/video/?/video/' + cnn_url,
+            'ie_key': CNNIE.ie_key(),
+        }
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
new file mode 100644 (file)
index 0000000..79e2fbd
--- /dev/null
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class EroProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
+        'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
+        'info_dict': {
+            'id': '3733775',
+            'display_id': 'sexy-babe-softcore',
+            'ext': 'm4v',
+            'title': 'sexy babe softcore',
+            'thumbnail': 're:https?://.*\.jpg',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
+            webpage, 'video id', default=None)
+
+        video_url = self._search_regex(
+            r'<source src="([^"]+)', webpage, 'video url')
+        title = self._html_search_regex(
+            r'Title:</th><td>([^<]+)</td>', webpage, 'title')
+        thumbnail = self._search_regex(
+            r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
index 263f09b4645fa8b6255f1216e99cab27afce2bee..8a333f1d24d6be3bd5160d843c3cd6451ef83178 100644 (file)
@@ -28,23 +28,27 @@ class SunPornoIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
-        title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
-        description = self._html_search_meta('description', webpage, 'description')
+        title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description')
         thumbnail = self._html_search_regex(
             r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
 
         duration = parse_duration(self._search_regex(
-            r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))
+            r'itemprop="duration">\s*(\d+:\d+)\s*<',
+            webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
-            r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))
+            r'class="views">\s*(\d+)\s*<',
+            webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
-            r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
+            r'(\d+)</b> Comments?',
+            webpage, 'comment count', fatal=False))
 
         formats = []
         quality = qualities(['mp4', 'flv'])