Merge branch 'huajiao' of https://github.com/pyx/youtube-dl into pyx-huajiao
author Yen Chi Hsuan <yan12125@gmail.com>
Sat, 15 Oct 2016 06:53:05 +0000 (14:53 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sat, 15 Oct 2016 06:53:05 +0000 (14:53 +0800)
12 files changed:
ChangeLog
README.md
youtube_dl/extractor/canalplus.py
youtube_dl/extractor/carambatv.py
youtube_dl/extractor/cbsinteractive.py
youtube_dl/extractor/chirbit.py
youtube_dl/extractor/clipfish.py
youtube_dl/extractor/cmt.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/safari.py
youtube_dl/extractor/videomore.py

index d2b78a4891f849c44ca029662d88345fa6f28114..c1173150dc0a2d5d6785f97a6e027493834b8315 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,11 @@
 version <unreleased>
 
 Extractors
+* [cmt] Fix mgid extraction (#10813)
+* [chirbit] Fix extraction of user profile pages
+* [carambatv] Fix extraction
+* [canalplus] Fix extraction for some videos
+* [cbsinteractive] Fix extraction for cnet.com
 * [parliamentliveuk] Lower case URLs are now recognized (#10912)
 
 
index 1cb44b2cfb9e6578a2dc10893d0493e0140a7b1b..a53deb819f80bbda0e96586c9b361a9b6a61d45f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -902,7 +902,7 @@ If you want to find out whether a given URL is supported, simply call youtube-dl
 
 # Why do I need to go through that much red tape when filing bugs?
 
-Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless, for instance because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was alrady reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download and many more simple, easy-to-avoid problems, many of whom were totally unrelated to youtube-dl.
+Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless, for instance because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was already reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download and many more simple, easy-to-avoid problems, many of whom were totally unrelated to youtube-dl.
 
 youtube-dl is an open-source project manned by too few volunteers, so we'd rather spend time fixing bugs where we are certain none of those simple problems apply, and where we can be reasonably confident to be able to reproduce the issue without asking the reporter repeatedly. As such, the output of `youtube-dl -v YOUR_URL_HERE` is really all that's required to file an issue. The issue template also guides you through some basic steps you can do, such as checking that your version of youtube-dl is current.
 
index 6dab226af422266ee9898ddd061515bdc76f0101..1c3c41d26619ec2fa347c4a75093b2a1cf7003a2 100644 (file)
@@ -6,11 +6,13 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
+    dict_get,
     ExtractorError,
     HEADRequest,
-    unified_strdate,
-    qualities,
     int_or_none,
+    qualities,
+    remove_end,
+    unified_strdate,
 )
 
 
@@ -43,47 +45,46 @@ class CanalplusIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
-        'md5': '41f438a4904f7664b91b4ed0dec969dc',
         'info_dict': {
-            'id': '1192814',
+            'id': '1405510',
+            'display_id': 'pid1830-c-zapping',
             'ext': 'mp4',
-            'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014",
-            'description': "Toute l'année 2014 dans un Zapping exceptionnel !",
-            'upload_date': '20150105',
+            'title': 'Zapping - 02/07/2016',
+            'description': 'Le meilleur de toutes les chaînes, tous les jours',
+            'upload_date': '20160702',
         },
     }, {
         'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
         'info_dict': {
             'id': '1108190',
-            'ext': 'flv',
-            'title': 'Le labyrinthe - Boing super ranger',
+            'display_id': 'pid1405-le-labyrinthe-boing-super-ranger',
+            'ext': 'mp4',
+            'title': 'BOING SUPER RANGER - Ep : Le labyrinthe',
             'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
             'upload_date': '20140724',
         },
         'skip': 'Only works from France',
     }, {
-        'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',
+        'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html',
+        'md5': '4b47b12b4ee43002626b97fad8fb1de5',
         'info_dict': {
-            'id': '1390231',
+            'id': '1420213',
+            'display_id': 'pid6318-videos-integrales',
             'ext': 'mp4',
-            'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité",
-            'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6',
-            'upload_date': '20160512',
-        },
-        'params': {
-            'skip_download': True,
+            'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016',
+            'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799',
+            'upload_date': '20161014',
         },
+        'skip': 'Only works from France',
     }, {
-        'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',
+        'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
         'info_dict': {
-            'id': '1398334',
+            'id': '1420176',
+            'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
             'ext': 'mp4',
-            'title': "L'invité de Bruce Toussaint du 07/06/2016 - ",
-            'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324',
-            'upload_date': '20160607',
-        },
-        'params': {
-            'skip_download': True,
+            'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ',
+            'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.',
+            'upload_date': '20161014',
         },
     }, {
         'url': 'http://m.canalplus.fr/?vid=1398231',
@@ -95,18 +96,17 @@ class CanalplusIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid')
 
         site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]
 
         # Beware, some subclasses do not define an id group
-        display_id = mobj.group('display_id') or video_id
+        display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html')
 
-        if video_id is None:
-            webpage = self._download_webpage(url, display_id)
-            video_id = self._search_regex(
-                [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'],
-                webpage, 'video id', group='id')
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)',
+             r'id=["\']canal_video_player(?P<id>\d+)'],
+            webpage, 'video id', group='id')
 
         info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
         video_data = self._download_json(info_url, video_id, 'Downloading video JSON')
index 5797fb95142f556bade163051f4f51a40e32cda7..66c0f900a402664653a846e9b39fc44c1da2853e 100644 (file)
@@ -9,6 +9,8 @@ from ..utils import (
     try_get,
 )
 
+from .videomore import VideomoreIE
+
 
 class CarambaTVIE(InfoExtractor):
     _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
@@ -62,14 +64,16 @@ class CarambaTVPageIE(InfoExtractor):
     _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
-        'md5': '',
+        'md5': 'a49fb0ec2ad66503eeb46aac237d3c86',
         'info_dict': {
-            'id': '191910501',
-            'ext': 'mp4',
+            'id': '475222',
+            'ext': 'flv',
             'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'duration': 2678.31,
+            'thumbnail': 're:^https?://.*\.jpg',
+            # duration reported by videomore is incorrect
+            'duration': int,
         },
+        'add_ie': [VideomoreIE.ie_key()],
     }
 
     def _real_extract(self, url):
@@ -77,6 +81,16 @@ class CarambaTVPageIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
+        videomore_url = VideomoreIE._extract_url(webpage)
+        if videomore_url:
+            title = self._og_search_title(webpage)
+            return {
+                '_type': 'url_transparent',
+                'url': videomore_url,
+                'ie_key': VideomoreIE.ie_key(),
+                'title': title,
+            }
+
         video_url = self._og_search_property('video:iframe', webpage, default=None)
 
         if not video_url:
index 821db20b23052ca71d594c6c05ad705a400129a3..57b18e81d412b20162f60e8d8e44699b76f2e3af 100644 (file)
@@ -63,7 +63,7 @@ class CBSInteractiveIE(ThePlatformIE):
         webpage = self._download_webpage(url, display_id)
 
         data_json = self._html_search_regex(
-            r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'",
+            r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'",
             webpage, 'data json')
         data = self._parse_json(data_json, display_id)
         vdata = data.get('video') or data['videos'][0]
index 61aed016753b28ceac34974c76aca0f9e1639f35..f35df143a604695c0b1fe7b0e33d7384192d1d98 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import base64
+import re
 
 from .common import InfoExtractor
 from ..utils import parse_duration
@@ -70,7 +71,6 @@ class ChirbitProfileIE(InfoExtractor):
         'url': 'http://chirbit.com/ScarletBeauty',
         'info_dict': {
             'id': 'ScarletBeauty',
-            'title': 'Chirbits by ScarletBeauty',
         },
         'playlist_mincount': 3,
     }
@@ -78,13 +78,10 @@ class ChirbitProfileIE(InfoExtractor):
     def _real_extract(self, url):
         profile_id = self._match_id(url)
 
-        rss = self._download_xml(
-            'http://chirbit.com/rss/%s' % profile_id, profile_id)
+        webpage = self._download_webpage(url, profile_id)
 
         entries = [
-            self.url_result(audio_url.text, 'Chirbit')
-            for audio_url in rss.findall('./channel/item/link')]
+            self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
+            for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
 
-        title = rss.find('./channel/title').text
-
-        return self.playlist_result(entries, profile_id, title)
+        return self.playlist_result(entries, profile_id)
index 3a47f6fa4e1cdf734670ff64abb9aa4c02c94a6e..bb52e0c6ff75178626f83cd0a6d2de6607e861ad 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
@@ -10,15 +11,15 @@ from ..utils import (
 class ClipfishIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        'md5': '79bc922f3e8a9097b3d68a93780fd475',
+        'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
+        'md5': '720563e467b86374c194bdead08d207d',
         'info_dict': {
-            'id': '3966754',
+            'id': '4343170',
             'ext': 'mp4',
-            'title': 'FIFA 14 - E3 2013 Trailer',
-            'description': 'Video zu FIFA 14: E3 2013 Trailer',
-            'upload_date': '20130611',
-            'duration': 82,
+            'title': 'S01 E01 - Ugly Americans - Date in der Hölle',
+            'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.',
+            'upload_date': '20161005',
+            'duration': 1291,
             'view_count': int,
         }
     }
@@ -50,10 +51,14 @@ class ClipfishIE(InfoExtractor):
                 'tbr': int_or_none(video_info.get('bitrate')),
             })
 
+        descr = video_info.get('descr')
+        if descr:
+            descr = descr.strip()
+
         return {
             'id': video_id,
             'title': video_info['title'],
-            'description': video_info.get('descr'),
+            'description': descr,
             'formats': formats,
             'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'),
             'duration': int_or_none(video_info.get('media_length')),
index ac3bdfe8f2fcaf2344d02c00f53dcfc9cb12af32..7d3e9b0c9ce89fff9b8094f2d86beaa5fb35e7e0 100644 (file)
@@ -26,7 +26,7 @@ class CMTIE(MTVIE):
             'id': '1504699',
             'ext': 'mp4',
             'title': 'Still The King Ep. 109 in 3 Minutes',
-            'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.',
+            'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.',
             'timestamp': 1469421000.0,
             'upload_date': '20160725',
         },
@@ -42,3 +42,8 @@ class CMTIE(MTVIE):
                 '%s said: video is not available' % cls.IE_NAME, expected=True)
 
         return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url)
+
+    def _extract_mgid(self, webpage):
+        return self._search_regex(
+            r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
+            webpage, 'mgid', group='mgid')
index c38fd095a3ea95facf214e6eb350fc71a20b39a6..cc141f68ec52f4d3b7f795a099b5b1ccf310fdbb 100644 (file)
@@ -150,6 +150,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # rtmp
             'skip_download': True,
         },
+        'skip': 'Video gone',
     }, {
         'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
         'info_dict': {
index 6ae30679a0a226b0d242b2ef773fbab9a90920c5..c7b1075725222842fbdad5cc300011e440824a0a 100644 (file)
@@ -1,28 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
 import re
 import calendar
 import datetime
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     HEADRequest,
     unified_strdate,
-    ExtractorError,
     strip_jsonp,
     int_or_none,
     float_or_none,
     determine_ext,
     remove_end,
+    unescapeHTML,
 )
 
 
 class ORFTVthekIE(InfoExtractor):
     IE_NAME = 'orf:tvthek'
     IE_DESC = 'ORF TVthek'
-    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
@@ -51,26 +51,23 @@ class ORFTVthekIE(InfoExtractor):
             'skip_download': True,  # rtsp downloads
         },
         '_skip': 'Blocked outside of Austria / Germany',
+    }, {
+        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
+        'skip_download': True,
+    }, {
+        'url': 'http://tvthek.orf.at/profile/Universum/35429',
+        'skip_download': True,
     }]
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
         webpage = self._download_webpage(url, playlist_id)
 
-        data_json = self._search_regex(
-            r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
-        all_data = json.loads(data_json)
-
-        def get_segments(all_data):
-            for data in all_data:
-                if data['name'] in (
-                        'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM',
-                        'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'):
-                    return data['values']['segments']
-
-        sdata = get_segments(all_data)
-        if not sdata:
-            raise ExtractorError('Unable to extract segments')
+        data_jsb = self._parse_json(
+            self._search_regex(
+                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
+                webpage, 'playlist', group='json'),
+            playlist_id, transform_source=unescapeHTML)['playlist']['videos']
 
         def quality_to_int(s):
             m = re.search('([0-9]+)', s)
@@ -79,8 +76,11 @@ class ORFTVthekIE(InfoExtractor):
             return int(m.group(1))
 
         entries = []
-        for sd in sdata:
-            video_id = sd['id']
+        for sd in data_jsb:
+            video_id, title = sd.get('id'), sd.get('title')
+            if not video_id or not title:
+                continue
+            video_id = compat_str(video_id)
             formats = [{
                 'preference': -10 if fd['delivery'] == 'hls' else None,
                 'format_id': '%s-%s-%s' % (
@@ -88,7 +88,7 @@ class ORFTVthekIE(InfoExtractor):
                 'url': fd['src'],
                 'protocol': fd['protocol'],
                 'quality': quality_to_int(fd['quality']),
-            } for fd in sd['playlist_item_array']['sources']]
+            } for fd in sd['sources']]
 
             # Check for geoblocking.
             # There is a property is_geoprotection, but that's always false
@@ -115,14 +115,14 @@ class ORFTVthekIE(InfoExtractor):
             self._check_formats(formats, video_id)
             self._sort_formats(formats)
 
-            upload_date = unified_strdate(sd['created_date'])
+            upload_date = unified_strdate(sd.get('created_date'))
             entries.append({
                 '_type': 'video',
                 'id': video_id,
-                'title': sd['header'],
+                'title': title,
                 'formats': formats,
                 'description': sd.get('description'),
-                'duration': int(sd['duration_in_seconds']),
+                'duration': int_or_none(sd.get('duration_in_seconds')),
                 'upload_date': upload_date,
                 'thumbnail': sd.get('image_full_url'),
             })
index 8b35fd244addc68bb99b345c949284bc5f170361..c3aec1edde5e9d02efb377fa39941ae01d2f04b4 100644 (file)
@@ -157,7 +157,14 @@ class SafariCourseIE(SafariBaseIE):
     IE_NAME = 'safari:course'
     IE_DESC = 'safaribooksonline.com online courses'
 
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)|
+                            techbus\.safaribooksonline\.com
+                        )
+                        /(?P<id>[^/]+)/?(?:[#?]|$)
+                    '''
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
@@ -170,6 +177,9 @@ class SafariCourseIE(SafariBaseIE):
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
         'only_matching': True,
+    }, {
+        'url': 'http://techbus.safaribooksonline.com/9780134426365',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 8a11ff84828a26d35566c7f5fe65c3f4cdc322b4..7f25665864c696757903deeb582a64f16eec0d85 100644 (file)
@@ -86,6 +86,11 @@ class VideomoreIE(InfoExtractor):
         mobj = re.search(
             r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1',
             webpage)
+        if not mobj:
+            mobj = re.search(
+                r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)',
+                webpage)
+
         if mobj:
             return mobj.group('url')