Merge branch 'porn91' of https://github.com/PeterDing/youtube-dl into PeterDing-porn91
author    Yen Chi Hsuan <yan12125@gmail.com>
          Sat, 30 May 2015 15:33:10 +0000 (23:33 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
          Sat, 30 May 2015 15:33:10 +0000 (23:33 +0800)
18 files changed:
README.md
docs/supportedsites.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/nowtv.py [new file with mode: 0644]
youtube_dl/extractor/rtlnow.py [deleted file]
youtube_dl/extractor/senateisvp.py
youtube_dl/extractor/soompi.py [new file with mode: 0644]
youtube_dl/extractor/spiegeltv.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tubitv.py [new file with mode: 0644]
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/youtube.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/version.py

index e51bb534341e389a26a466f1fb4c3ef721731016..f3d83c89fafbdff0dc6b22b6d207a28428b59f96 100644 (file)
--- a/README.md
+++ b/README.md
@@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
     --no-progress                    Do not print progress bar
     --console-title                  Display progress in console titlebar
     -v, --verbose                    Print various debugging information
-    --dump-pages                     Print downloaded pages to debug problems (very verbose)
+    --dump-pages                     Print downloaded pages encoded using base64 to debug problems (very verbose)
     --write-pages                    Write downloaded intermediary pages to files in the current directory to debug problems
     --print-traffic                  Display sent and read HTTP traffic
     -C, --call-home                  Contact the youtube-dl server for debugging
@@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like.
     --embed-thumbnail                Embed thumbnail in the audio as cover art
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed
-                                     parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s -
+                                     parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s -
                                      %(title)s" matches a title like "Coldplay - Paradise"
     --xattrs                         Write metadata to the video file's xattrs (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default;
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a4879bd9a1a4b5221e824927e0ef0ec4d6c9f734..a421ae62bf95bd8173c0f6427163b25dd150ae18 100644 (file)
@@ -26,8 +26,7 @@
  - **anitube.se**
  - **AnySex**
  - **Aparat**
- - **AppleDailyAnimationNews**
- - **AppleDailyRealtimeNews**
+ - **AppleDaily**
  - **AppleTrailers**
  - **archive.org**: archive.org videos
  - **ARD**
  - **fc2**
  - **fernsehkritik.tv**
  - **fernsehkritik.tv:postecke**
- - **Firedrive**
  - **Firstpost**
  - **Flickr**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
+ - **KarriereVideos**
  - **keek**
  - **KeezMovies**
  - **KhanAcademy**
  - **NosVideo**
  - **novamov**: NovaMov
  - **Nowness**
+ - **NowTV**
  - **nowvideo**: NowVideo
  - **npo.nl**
  - **npo.nl:live**
  - **Rte**
  - **rtl.nl**: rtl.nl and rtlxl.nl
  - **RTL2**
- - **RTLnow**
  - **RTP**
  - **RTS**: RTS.ch
  - **rtve.es:alacarta**: RTVE a la carta
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
  - **Snotr**
- - **Sockshare**
  - **Sohu**
  - **soundcloud**
  - **soundcloud:playlist**
  - **vier:videos**
  - **Viewster**
  - **viki**
+ - **viki:channel**
  - **vimeo**
  - **vimeo:album**
  - **vimeo:channel**
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index d1953c18f39b438740aec88a1aadf4d529a8e0b4..21d247f234cf7d8ed6a73bdc8132e4271d8a907a 100755 (executable)
@@ -1047,6 +1047,8 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')
 
+        formats_dict = {}
+
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
             if 'url' not in format:
@@ -1054,6 +1056,18 @@ class YoutubeDL(object):
 
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
             if format.get('format') is None:
                 format['format'] = '{id} - {res}{note}'.format(
                     id=format['format_id'],
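
Note on the YoutubeDL.py hunk above: it guarantees unique format_id values by grouping formats per id and appending an index when a collision is found. A minimal standalone sketch of that logic, using made-up format entries (illustrative only, not part of the diff):

    formats = [{'format_id': 'hls'}, {'format_id': 'hls'}, {'format_id': 'http'}]

    # group formats by their current format_id
    formats_dict = {}
    for fmt in formats:
        formats_dict.setdefault(fmt['format_id'], []).append(fmt)

    # rename only the ambiguous groups: 'hls' -> 'hls-0', 'hls-1'
    for format_id, ambiguous_formats in formats_dict.items():
        if len(ambiguous_formats) > 1:
            for i, fmt in enumerate(ambiguous_formats):
                fmt['format_id'] = '%s-%d' % (format_id, i)

    # formats == [{'format_id': 'hls-0'}, {'format_id': 'hls-1'}, {'format_id': 'http'}]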
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d20ad286d1712cba23576f9bad27fd91f22594d9..4dc07efe0e9f3c962a9880ecc5abc4d475dcf00f 100644 (file)
@@ -354,6 +354,7 @@ from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
 from .novamov import NovaMovIE
 from .nowness import NownessIE
+from .nowtv import NowTVIE
 from .nowvideo import NowVideoIE
 from .npo import (
     NPOIE,
@@ -438,7 +439,6 @@ from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
 from .rte import RteIE
 from .rtlnl import RtlNlIE
-from .rtlnow import RTLnowIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
@@ -481,6 +481,10 @@ from .smotri import (
 )
 from .snotr import SnotrIE
 from .sohu import SohuIE
+from .soompi import (
+    SoompiIE,
+    SoompiShowIE,
+)
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -566,6 +570,7 @@ from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
 from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tunein import TuneInIE
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1c77df47ef346173fc11a58396c98768e5afc986..41f0c736d98c229518bacb41fac2f35ce9b80958 100644 (file)
@@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor):
         self._login()
 
     def _decrypt_subtitles(self, data, iv, id):
-        data = bytes_to_intlist(data)
-        iv = bytes_to_intlist(iv)
+        data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+        iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
         id = int(id)
 
         def obfuscate_key_aux(count, modulo, start):
@@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
         return output
 
+    def _extract_subtitles(self, subtitle):
+        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
     def _get_subtitles(self, video_id, webpage):
         subtitles = {}
         for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
@@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
             if not id or not iv or not data:
                 continue
-            id = int(id)
-            iv = base64.b64decode(iv)
-            data = base64.b64decode(data)
-
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
             lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
-            sub_root = xml.etree.ElementTree.fromstring(subtitle)
-            subtitles[lang_code] = [
-                {
-                    'ext': 'srt',
-                    'data': self._convert_subtitles_to_srt(sub_root),
-                },
-                {
-                    'ext': 'ass',
-                    'data': self._convert_subtitles_to_ass(sub_root),
-                },
-            ]
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
         return subtitles
 
     def _real_extract(self, url):
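
The crunchyroll.py change above moves the base64 decoding of the subtitle <data> and <iv> values into _decrypt_subtitles, so callers (including the new Soompi extractor below) can pass the encoded strings directly. A rough sketch of that decode step with made-up values, assuming bytes_to_intlist simply turns a byte string into a list of integers:

    import base64

    def bytes_to_intlist(bs):
        # same idea as youtube_dl.utils.bytes_to_intlist
        return list(bytearray(bs))

    raw_iv = b'\x00' * 16                                   # hypothetical 16-byte IV
    encoded_iv = base64.b64encode(raw_iv).decode('utf-8')   # what the <iv> element would carry

    iv = bytes_to_intlist(base64.b64decode(encoded_iv.encode('utf-8')))
    # iv is now a list of ints ready for the AES decryption routine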
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index e8d6827165f2b7c40b51c084f128c0992b7e49cf..82dc27bc6ff3ed2edd3b318f3ed3d14e360ef22d 100644 (file)
@@ -152,7 +152,7 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot find video formats')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
             default=None)
         if not video_title:
             video_title = self._html_search_regex(
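
The facebook.py tweak above loosens the title regex so the <h2> may carry attributes besides class. A quick illustration with made-up markup (plain re, ignoring the HTML cleaning that _html_search_regex also applies):

    import re

    pattern = r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>'
    html = '<h2 data-ft="x" class="uiHeaderTitle" id="u_0_z">Some video title</h2>'
    print(re.search(pattern, html).group(1))  # -> Some video title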
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644 (file)
index 0000000..173e46c
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    remove_start,
+)
+
+
+class NowTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player'
+
+    _TESTS = [{
+        # rtl
+        'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+        'info_dict': {
+            'id': '203519',
+            'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+            'ext': 'mp4',
+            'title': 'Die neuen Bauern und eine Hochzeit',
+            'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432580700,
+            'upload_date': '20150525',
+            'duration': 2786,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtl2
+        'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+        'info_dict': {
+            'id': '203481',
+            'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+            'ext': 'mp4',
+            'title': 'Berlin - Tag & Nacht (Folge 934)',
+            'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432666800,
+            'upload_date': '20150526',
+            'duration': 2641,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # rtlnitro
+        'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+        'info_dict': {
+            'id': '165780',
+            'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+            'ext': 'mp4',
+            'title': 'Hals- und Beinbruch',
+            'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432415400,
+            'upload_date': '20150523',
+            'duration': 2742,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # superrtl
+        'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+        'info_dict': {
+            'id': '99205',
+            'display_id': 'medicopter-117/angst',
+            'ext': 'mp4',
+            'title': 'Angst!',
+            'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1222632900,
+            'upload_date': '20080928',
+            'duration': 3025,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # ntv
+        'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+        'info_dict': {
+            'id': '203521',
+            'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+            'ext': 'mp4',
+            'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+            'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432751700,
+            'upload_date': '20150527',
+            'duration': 1083,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # vox
+        'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+        'info_dict': {
+            'id': '128953',
+            'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+            'ext': 'mp4',
+            'title': "Büro-Fall / Chihuahua 'Joel'",
+            'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1432408200,
+            'upload_date': '20150523',
+            'duration': 3092,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        station = mobj.group('station')
+
+        info = self._download_json(
+            'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id,
+            display_id)
+
+        video_id = compat_str(info['id'])
+
+        files = info['files']
+        if not files:
+            if info.get('geoblocked', False):
+                raise ExtractorError(
+                    'Video %s is not available from your location due to geo restriction' % video_id,
+                    expected=True)
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+
+        f = info.get('format', {})
+        station = f.get('station') or station
+
+        STATIONS = {
+            'rtl': 'rtlnow',
+            'rtl2': 'rtl2now',
+            'vox': 'voxnow',
+            'nitro': 'rtlnitronow',
+            'ntv': 'n-tvnow',
+            'superrtl': 'superrtlnow'
+        }
+
+        formats = []
+        for item in files['items']:
+            item_path = remove_start(item['path'], '/')
+            tbr = int_or_none(item['bitrate'])
+            m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
+            m3u8_url = m3u8_url.replace('now/', 'now/videos/')
+            formats.append({
+                'url': m3u8_url,
+                'format_id': '%s-%sk' % (item['id'], tbr),
+                'ext': 'mp4',
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        title = info['title']
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
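
For reference, the new NowTV extractor above derives each HLS manifest URL from the station-to-hostname mapping and a file item's path. A hedged sketch with a hypothetical item path (the real paths come from the api.nowtv.de response):

    STATIONS = {
        'rtl': 'rtlnow', 'rtl2': 'rtl2now', 'vox': 'voxnow',
        'nitro': 'rtlnitronow', 'ntv': 'n-tvnow', 'superrtl': 'superrtlnow',
    }

    station = 'rtl'
    item_path = 'rtlnow/bauer-sucht-frau/episode.f4v'  # hypothetical item['path'] with the leading '/' removed

    m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path)
    m3u8_url = m3u8_url.replace('now/', 'now/videos/')
    # -> http://hls.fra.rtlnow.de/hls-vod-enc/rtlnow/videos/bauer-sucht-frau/episode.f4v.m3u8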
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644 (file)
index 785a804..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    unified_strdate,
-    int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
-    """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'''(?x)
-                        (?:https?://)?
-                        (?P<url>
-                            (?P<domain>
-                                rtl-now\.rtl\.de|
-                                rtl2now\.rtl2\.de|
-                                (?:www\.)?voxnow\.de|
-                                (?:www\.)?rtlnitronow\.de|
-                                (?:www\.)?superrtlnow\.de|
-                                (?:www\.)?n-tvnow\.de)
-                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
-                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
-                            player=1(?:&season=[0-9]+)?(?:&.*)?
-                        )'''
-
-    _TESTS = [
-        {
-            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-            'info_dict': {
-                'id': '90419',
-                'ext': 'flv',
-                'title': 'Ahornallee - Folge 1 - Der Einzug',
-                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
-                'upload_date': '20070416',
-                'duration': 1685,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-            'info_dict': {
-                'id': '69756',
-                'ext': 'flv',
-                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
-                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-                'upload_date': '20120519',
-                'duration': 1245,
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'Only works from Germany',
-        },
-        {
-            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-            'info_dict': {
-                'id': '13883',
-                'ext': 'flv',
-                'title': 'Voxtours - Südafrika-Reporter II',
-                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
-                'upload_date': '20090627',
-                'duration': 1800,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-            'info_dict': {
-                'id': '99205',
-                'ext': 'flv',
-                'title': 'Medicopter 117 - Angst!',
-                'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
-                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
-                'upload_date': '20080928',
-                'duration': 2691,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
-            'info_dict': {
-                'id': '188729',
-                'ext': 'flv',
-                'upload_date': '20150204',
-                'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
-                'title': 'Der Bachelor - Folge 4',
-            }
-        }, {
-            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_page_url = 'http://%s/' % mobj.group('domain')
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
-        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
-        if mobj:
-            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage, default=None)
-
-        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
-        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
-        duration = int(mobj.group('seconds')) if mobj else None
-
-        playerdata_url = self._html_search_regex(
-            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
-        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
-        videoinfo = playerdata.find('./playlist/videoinfo')
-
-        formats = []
-        for filename in videoinfo.findall('filename'):
-            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
-            if mobj:
-                fmt = {
-                    'url': mobj.group('url'),
-                    'play_path': 'mp4:' + mobj.group('play_path'),
-                    'page_url': video_page_url,
-                    'player_url': video_page_url + 'includes/vodplayer.swf',
-                }
-            else:
-                mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
-                if mobj:
-                    fmt = {
-                        'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'),
-                        'play_path': 'mp4:' + mobj.group('play_path'),
-                        'page_url': url,
-                        'player_url': video_page_url + 'includes/vodplayer.swf',
-                    }
-                else:
-                    fmt = {
-                        'url': filename.text,
-                    }
-            fmt.update({
-                'width': int_or_none(filename.get('width')),
-                'height': int_or_none(filename.get('height')),
-                'vbr': int_or_none(filename.get('bitrate')),
-                'ext': 'flv',
-            })
-            formats.append(fmt)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index d3b8a1be49702f71a1a8c4eb7bd01d17cf103071..9c53704ea383b1af34e8f8157e327b71c2c3865a 100644 (file)
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
         ["arch", "", "http://ussenate-f.akamaihd.net/"]
     ]
     _IE_NAME = 'senate.gov'
-    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
     _TESTS = [{
         'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
         'info_dict': {
@@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Integrated Senate Video Player'
         }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _search_iframe_url(webpage):
         mobj = re.search(
-            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
             webpage)
         if mobj:
             return mobj.group('url')
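
The senateisvp.py change above makes the slash before the query string optional in both _VALID_URL and the iframe pattern. A small check (not part of the diff) that both URL forms now match:

    import re

    valid_url = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
    for url in (
        'http://www.senate.gov/isvp/?comm=judiciary&type=live&filename=judiciary031715',
        'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
    ):
        assert re.match(valid_url, url) is not None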
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644 (file)
index 0000000..5da66ca
--- /dev/null
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_start,
+    xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        episodes = self._parse_json(
+            self._search_regex(
+                r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'),
+            None)
+        return list(filter(episode_filter, episodes))
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+    IE_NAME = 'soompi'
+    _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/watch/29235',
+        'info_dict': {
+            'id': '29235',
+            'ext': 'mp4',
+            'title': 'Episode 1096',
+            'description': '2015-05-20'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _get_episode(self, webpage, video_id):
+        return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0]
+
+    def _get_subtitles(self, config, video_id):
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            if not id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                webpage = ee.cause.read()
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+    IE_NAME = 'soompi:show'
+    _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+    _TESTS = [{
+        'url': 'http://tv.soompi.com/en/shows/liar-game',
+        'info_dict': {
+            'id': 'liar-game',
+            'title': 'Liar Game',
+            'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+        },
+        'playlist_count': 14,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        title = remove_start(self._og_search_title(webpage), 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = [
+            self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi')
+            for episode in self._get_episodes(webpage)]
+
+        return self.playlist_result(entries, show_id, title, description)
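
The Soompi extractors above read the episode list from a VIDEOS JavaScript variable embedded in the page. An illustration with a made-up page snippet, assuming the variable holds a plain JSON array as the regex expects:

    import json
    import re

    webpage = 'var VIDEOS = [{"id": "29235", "name": "Episode 1096"}]; var OTHER = 1;'
    episodes = json.loads(
        re.search(r'VIDEOS\s*=\s*(\[.+?\]);', webpage).group(1))
    # episodes == [{'id': '29235', 'name': 'Episode 1096'}]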
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89a1151edfd11b8f15a86eeaa6a83178d..359722ad697a7948bea484e7c8f8f77a924544f8 100644 (file)
@@ -51,9 +51,9 @@ class SpiegeltvIE(InfoExtractor):
         is_wide = media_json['is_wide']
 
         server_json = self._download_json(
-            'http://www.spiegel.tv/streaming_servers/', video_id,
-            note='Downloading server information')
-        server = server_json[0]['endpoint']
+            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+            video_id, note='Downloading server information')
+        server = server_json['streamingserver'][0]['endpoint']
 
         thumbnails = []
         for image in media_json['images']:
@@ -76,5 +76,6 @@ class SpiegeltvIE(InfoExtractor):
             'ext': 'm4v',
             'description': description,
             'duration': duration,
-            'thumbnails': thumbnails
+            'thumbnails': thumbnails,
+            'rtmp_live': True,
         }
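
The spiegel.tv change above switches the server lookup to a static projectConfig.json and takes the first entry under 'streamingserver'. A hedged sketch of that lookup; the JSON shape is inferred from the code and the endpoint value is made up:

    server_json = {
        'streamingserver': [
            {'endpoint': 'rtmp://spiegeltv-example.example.net/spiegeltv/'},   # hypothetical
        ],
    }
    server = server_json['streamingserver'][0]['endpoint']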
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 656410528172fed2a171bd2935afa247fb2e6841..3a68eaa80ea6867e6806a4f242a8afc910b8ba06 100644 (file)
@@ -6,7 +6,7 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+    _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
     _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
@@ -35,6 +35,9 @@ class TF1IE(InfoExtractor):
     }, {
         'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
         'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762a1b455cfd4bc384c27e2dd85e776dde..6ca8840b0869f1490c06c7df0905c28e38ba1fee 100644 (file)
@@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor):
         webpage = self._download_webpage(req, display_id)
 
         flashvars = json.loads(self._html_search_regex(
-            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+            r'flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
 
         video_url = flashvars['video_url']
         if flashvars.get('encrypted') is True:
@@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor):
         thumbnail = flashvars.get('image_url')
 
         title = self._html_search_regex(
-            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
-            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+            r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
-            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)
 
         like_count = int_or_none(self._html_search_regex(
-            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
         dislike_count = int_or_none(self._html_search_regex(
-            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
         view_count = self._html_search_regex(
-            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+            r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
         if view_count:
             view_count = str_to_int(view_count)
         comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644 (file)
index 0000000..2c4b218
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _TEST = {
+        'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+        'info_dict': {
+            'id': '54411',
+            'ext': 'mp4',
+            'title': 'The Kitchen Musical - EP01',
+            'thumbnail': 're:^https?://.*\.png$',
+            'description': 'md5:37532716166069b353e8866e71fefae7',
+            'duration': 2407,
+        },
+        'params': {
+            'skip_download': 'HLS download',
+        },
+    }
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        form_data = {
+            'username': username,
+            'password': password,
+        }
+        payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+        request = compat_urllib_request.Request(self._LOGIN_URL, payload)
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_page = self._download_webpage(
+            request, None, False, 'Wrong login info')
+        if not re.search(r'id="tubi-logout"', login_page):
+            raise ExtractorError(
+                'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+            raise ExtractorError(
+                'This video requires login, use --username and --password '
+                'options to provide account credentials.', expected=True)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+        m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
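
The new TubiTV extractor above deobfuscates the manifest URL stored in the page's 'apu' variable: the URL is reversed and ROT13-encoded. An illustrative round trip with a made-up URL:

    import codecs

    m3u8_url = 'http://example.com/hls/54411/index.m3u8'   # hypothetical manifest URL
    apu = codecs.encode(m3u8_url[::-1], 'rot_13')          # what the page would embed
    assert codecs.decode(apu, 'rot_13')[::-1] == m3u8_url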
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index e6ee1e4715efc5d47dd3f9aa32d6559a5737a8ea..f38a72fde8974a7a1ea290de04281f67079b1a16 100644 (file)
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+)
 
 
 class VGTVIE(InfoExtractor):
@@ -59,16 +62,16 @@ class VGTVIE(InfoExtractor):
         },
         {
             # streamType: live
-            'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
             'info_dict': {
-                'id': '100015',
+                'id': '113063',
                 'ext': 'flv',
-                'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
-                'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+                'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'description': 'md5:b3743425765355855f88e096acc93231',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 0,
-                'timestamp': 1407423348,
-                'upload_date': '20140807',
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
                 'view_count': int,
             },
             'params': {
@@ -97,7 +100,12 @@ class VGTVIE(InfoExtractor):
             % (host, video_id, HOST_WEBSITES[host]),
             video_id, 'Downloading media JSON')
 
+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
         streams = data['streamUrls']
+        stream_type = data.get('streamType')
 
         formats = []
 
@@ -107,7 +115,8 @@ class VGTVIE(InfoExtractor):
                 hls_url, video_id, 'mp4', m3u8_id='hls'))
 
         hds_url = streams.get('hds')
-        if hds_url:
+        # wasLive hds are always 404
+        if hds_url and stream_type != 'wasLive':
             formats.extend(self._extract_f4m_formats(
                 hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
                 video_id, f4m_id='hds'))
@@ -135,13 +144,14 @@ class VGTVIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': data['title'],
+            'title': self._live_title(data['title']),
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
             'formats': formats,
+            'is_live': True if stream_type == 'live' else False,
         }
 
 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0301682b8dd228cab336bc1e68eaf868660fd5c7..fcdbfe0bc959a011bebf8656184fe164b3eca84a 100644 (file)
@@ -1399,6 +1399,26 @@ class YoutubeChannelIE(InfoExtractor):
         channel_id = self._match_id(url)
 
         url = self._TEMPLATE_URL % channel_id
+
+        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+        # otherwise fallback on channel by page extraction
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        channel_playlist_id = self._search_regex(
+            [r'<meta itemprop="channelId" content="([^"]+)">',
+             r'data-channel-external-id="([^"]+)"'],
+            channel_page, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            channel_playlist = unescapeHTML(self._search_regex(
+                r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&amp;list=%s)"' % playlist_id,
+                channel_page, 'channel playlist URL', default=None))
+            if channel_playlist:
+                return self.url_result(
+                    compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist')
+
         channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
         autogenerated = re.search(r'''(?x)
                 class="[^"]*?(?:
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 8f825f7859058c9c40cd55e50ec9832a92858c32..774494efd1dbc9af1901bf67d1453689e846b780 100644 (file)
@@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
             os.remove(encodeFilename(filename))
             os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
-        elif info['ext'] == 'm4a':
+        elif info['ext'] in ['m4a', 'mp4']:
             if not check_executable('AtomicParsley', ['-v']):
                 raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
 
@@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
                 os.remove(encodeFilename(filename))
                 os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         else:
-            raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.')
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
 
         return [], info
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b333851534e9edd9c75ff70ee4350874530ea8f7..6537101310684fe09d5d6327bab9d46106112cf4 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.05.20'
+__version__ = '2015.05.29'