From: Yen Chi Hsuan
Date: Tue, 9 Jan 2018 09:31:52 +0000 (+0800)
Subject: Merge branch 'weibo' of https://github.com/sprhawk/youtube-dl into sprhawk-weibo
X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=commitdiff_plain;h=1dd38dc0f40a7c3adbd9e346aa603d8ac566045a;hp=6648fd8ad6e581354f46c840465cff4c92d2c6f3

Merge branch 'weibo' of https://github.com/sprhawk/youtube-dl into sprhawk-weibo
---
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index d7a91239f..ad52c8900 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.12.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.12.23**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.01.07*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.01.07**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2017.12.23
+[debug] youtube-dl version 2018.01.07
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
diff --git a/.travis.yml b/.travis.yml
index 5f4f3922b..92f326860 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,16 +7,21 @@ python:
   - "3.4"
   - "3.5"
   - "3.6"
+  - "pypy"
+  - "pypy3"
 sudo: false
 env:
   - YTDL_TEST_SET=core
   - YTDL_TEST_SET=download
 matrix:
+  include:
+    - env: JYTHON=true; YTDL_TEST_SET=core
+    - env: JYTHON=true; YTDL_TEST_SET=download
   fast_finish: true
   allow_failures:
     - env: YTDL_TEST_SET=download
+    - env: JYTHON=true; YTDL_TEST_SET=core
+    - env: JYTHON=true; YTDL_TEST_SET=download
+before_install:
+  - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi
 script: ./devscripts/run_tests.sh
-notifications:
-  email:
-    - filippo.valsorda@gmail.com
-    - yasoob.khld@gmail.com
diff --git a/ChangeLog b/ChangeLog
index 420a1bd11..9d37cdcef 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,69 @@
 version <unreleased>
 
 Extractors
+* [bilibili] fix extraction (#15188)
+
+
+version 2018.01.07
+
+Core
+* [utils] Fix youtube-dl under PyPy3 on Windows
+* [YoutubeDL] Output python implementation in debug header
+
+Extractors
++ [jwplatform] Add support for multiple embeds (#15192)
+* [mitele] Fix extraction (#15186)
++ [motherless] Add support for groups (#15124)
+* [lynda] Relax URL regular expression (#15185)
+* [soundcloud] Fallback to avatar picture for thumbnail (#12878)
+* [youku] Fix list extraction (#15135)
+* [openload] Fix extraction (#15166)
+* [lynda] Skip invalid subtitles (#15159)
+* [twitch] Pass video id to url_result when extracting playlist (#15139)
+* [rtve.es:alacarta] Fix extraction of some new URLs
+* [acast] Fix extraction (#15147)
+
+
+version 2017.12.31
+
+Core
++ [extractor/common] Add container meta field for formats extracted
+  in _parse_mpd_formats (#13616)
++ [downloader/hls] Use HTTP headers for key request
+* [common] Use AACL as the default fourcc when AudioTag is 255
+* [extractor/common] Fix extraction of DASH formats with the same
+  representation id (#15111)
+
+Extractors
++ [slutload] Add support for mobile URLs (#14806)
+* [abc:iview] Bypass geo restriction
+* [abc:iview] Fix extraction (#14711, #14782, #14838, #14917, #14963, #14985,
+  #15035, #15057, #15061, #15071, #15095, #15106)
+* [openload] Fix extraction (#15118)
+- [sandia] Remove extractor
+- [collegerama] Remove extractor
++ [mediasite] Add support for sites based on Mediasite Video Platform (#5428,
+  #11185, #14343)
++ [ufctv] Add support for ufc.tv (#14520)
+* [pluralsight] Fix missing first line of subtitles (#11118)
+* [openload] Fallback on f-page extraction (#14665, #14879)
+* [vimeo] Improve password protected videos extraction (#15114)
+* [aws] Fix canonical/signed headers generation on python 2 (#15102)
+
+
+version 2017.12.28
+
+Extractors
++ [internazionale] Add support for internazionale.it (#14973)
+* [playtvak] Relax video regular expression and make description optional
+  (#15037)
++ [filmweb] Add support for filmweb.no (#8773, #10368)
++ [23video] Add support for 23video.com
++ [espn] Add support for fivethirtyeight.com (#6864)
++ [umg:de] Add support for universal-music.de (#11582, #11584)
++ [espn] Add support for espnfc and extract more formats (#8053)
 * [youku] Update ccode (#14880)
++ [openload] Add support for oload.stream (#15070)
 * [youku] Fix list extraction (#15065)
 
 
diff --git a/devscripts/install_jython.sh b/devscripts/install_jython.sh
new file mode 100755
index 000000000..bafca4da4
--- /dev/null
+++ b/devscripts/install_jython.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+wget http://central.maven.org/maven2/org/python/jython-installer/2.7.1/jython-installer-2.7.1.jar
+java -jar jython-installer-2.7.1.jar -s -d "$HOME/jython"
+$HOME/jython/bin/jython -m pip install nose
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index eac35e390..79b343048 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -3,6 +3,7 @@
  - **1up.com**
  - **20min**
  - **220.ro**
+ - **23video**
  - **24video**
  - **3qsdn**: 3Q SDN
  - **3sat**
@@ -170,7 +171,6 @@
  - **CNN**
  - **CNNArticle**
  - **CNNBlogs**
- - **CollegeRama**
  - **ComCarCoff**
  - **ComedyCentral**
  - **ComedyCentralFullEpisodes**
@@ -269,6 +269,8 @@
  - **Fczenit**
  - **filmon**
  - **filmon:channel**
+ - **Filmweb**
+ - **FiveThirtyEight**
  - **FiveTV**
  - **Flickr**
  - **Flipagram**
@@ -359,6 +361,7 @@
  - **InfoQ**
  - **Instagram**
  - **instagram:user**: Instagram user profile
+ - **Internazionale**
  - **InternetVideoArchive**
  - **IPrima**
  - **iqiyi**: 爱奇艺
@@ -445,6 +448,7 @@
  - **media.ccc.de**
  - **Medialaan**
  - **Mediaset**
+ - **Mediasite**
  - **Medici**
  - **megaphone.fm**: megaphone.fm embedded players
  - **Meipai**: 美拍
@@ -474,6 +478,7 @@
  - **Moniker**: allmyvideos.net and vidspot.net
  - **Morningstar**: morningstar.com
  - **Motherless**
+ - **MotherlessGroup**
  - **Motorsport**: motorsport.com
  - **MovieClips**
  - **MovieFap**
@@ -713,7 +718,6 @@
  - **safari**: safaribooksonline.com online video
  - **safari:api**
  - **safari:course**: safaribooksonline.com online courses
- -
**Sandia**: Sandia National Laboratories - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -888,7 +892,9 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UFCTV** - **UKTVPlay** + - **umg:de**: Universal Music Deutschland - **Unistra** - **Unity** - **uol.com.br** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 8a372d2c9..7b31d5198 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -493,9 +493,20 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ _TEST_CASES = [ ( # https://github.com/rg3/youtube-dl/issues/13919 + # Also tests duplicate representation ids, see + # https://github.com/rg3/youtube-dl/issues/15111 'float_duration', 'http://unknown/manifest.mpd', [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'm4a', + 'format_id': '318597', + 'format_note': 'DASH audio', + 'protocol': 'http_dash_segments', + 'acodec': 'mp4a.40.2', + 'vcodec': 'none', + 'tbr': 61.587, + }, { 'manifest_url': 'http://unknown/manifest.mpd', 'ext': 'mp4', 'format_id': '318597', diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ace80f14b..97bd9c526 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2233,8 +2233,16 @@ class YoutubeDL(object): sys.exc_clear() except Exception: pass - self._write_string('[debug] Python version %s - %s\n' % ( - platform.python_version(), platform_name())) + + def python_implementation(): + impl_name = platform.python_implementation() + if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): + return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] + return impl_name + + self._write_string('[debug] Python version %s (%s) - %s\n' % ( + platform.python_version(), python_implementation(), + platform_name())) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2a62248ef..41ca9adf1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -3,12 +3,14 @@ from __future__ import unicode_literals import binascii import collections +import ctypes import email import getpass import io import itertools import optparse import os +import platform import re import shlex import shutil @@ -2906,6 +2908,24 @@ except ImportError: # not 2.6+ or is 3.x except ImportError: compat_zip = zip +if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): + # PyPy2 prior to version 5.4.0 expects byte strings as Windows function + # names, see the original PyPy issue [1] and the youtube-dl one [2]. + # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name + # 2. 
https://github.com/rg3/youtube-dl/pull/4392 + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + real = ctypes.WINFUNCTYPE(*args, **kwargs) + + def resf(tpl, *args, **kwargs): + funcname, dll = tpl + return real((str(funcname), dll), *args, **kwargs) + + return resf +else: + def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + __all__ = [ 'compat_HTMLParseError', 'compat_HTMLParser', @@ -2914,6 +2934,7 @@ __all__ = [ 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_ctypes_WINFUNCTYPE', 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 1a6e226c8..4dc3ab46a 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -163,7 +163,8 @@ class HlsFD(FragmentFD): return False if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) - decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read() + decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( + self._prepare_url(info_dict, decrypt_info['URI'])).read() frag_content = AES.new( decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 60f753b95..87017ed39 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,6 +1,9 @@ from __future__ import unicode_literals +import hashlib +import hmac import re +import time from .common import InfoExtractor from ..compat import compat_str @@ -10,6 +13,7 @@ from ..utils import ( int_or_none, parse_iso8601, try_get, + update_url_query, ) @@ -101,21 +105,24 @@ class ABCIE(InfoExtractor): class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + _GEO_COUNTRIES = ['AU'] # ABC iview programs are normally available for 14 days only. 
_TESTS = [{ - 'url': 'http://iview.abc.net.au/programs/diaries-of-a-broken-mind/ZX9735A001S00', + 'url': 'http://iview.abc.net.au/programs/call-the-midwife/ZW0898A003S00', 'md5': 'cde42d728b3b7c2b32b1b94b4a548afc', 'info_dict': { - 'id': 'ZX9735A001S00', + 'id': 'ZW0898A003S00', 'ext': 'mp4', - 'title': 'Diaries Of A Broken Mind', - 'description': 'md5:7de3903874b7a1be279fe6b68718fc9e', - 'upload_date': '20161010', - 'uploader_id': 'abc2', - 'timestamp': 1476064920, + 'title': 'Series 5 Ep 3', + 'description': 'md5:e0ef7d4f92055b86c4f33611f180ed79', + 'upload_date': '20171228', + 'uploader_id': 'abc1', + 'timestamp': 1514499187, + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Video gone', }] def _real_extract(self, url): @@ -126,20 +133,30 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - format_urls = [ - try_get(stream, lambda x: x['hds-unmetered'], compat_str)] - - # May have higher quality video - sd_url = try_get( - stream, lambda x: x['streams']['hds']['sd'], compat_str) - if sd_url: - format_urls.append(sd_url.replace('metered', 'um')) - - formats = [] - for format_url in format_urls: - if format_url: - formats.extend( - self._extract_akamai_formats(format_url, video_id)) + house_number = video_params.get('episodeHouseNumber') + path = '/auth/hls/sign?ts={0}&hn={1}&d=android-mobile'.format( + int(time.time()), house_number) + sig = hmac.new( + 'android.content.res.Resources'.encode('utf-8'), + path.encode('utf-8'), hashlib.sha256).hexdigest() + token = self._download_webpage( + 'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id) + + def tokenize_url(url, token): + return update_url_query(url, { + 'hdnea': token, + }) + + for sd in ('sd', 'sd-low'): + sd_url = try_get( + stream, lambda x: x['streams']['hls'][sd], compat_str) + if not sd_url: + continue + formats = self._extract_m3u8_formats( + tokenize_url(sd_url, token), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if formats: + break self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6dace3051..5871e72dc 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,7 +8,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, - parse_iso8601, + unified_timestamp, OnDemandPagedList, ) @@ -32,7 +32,7 @@ class ACastIE(InfoExtractor): }, { # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '55c0097badd7095f494c99a172f86501', + 'md5': 'e87d5b8516cd04c0d81b6ee1caca28d0', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', @@ -40,23 +40,24 @@ class ACastIE(InfoExtractor): 'timestamp': 1477346700, 'upload_date': '20161024', 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', - 'duration': 2797, + 'duration': 2766, } }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() cast_data = self._download_json( - 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) + 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id) + e = cast_data['result']['episode'] return { - 'id': compat_str(cast_data['id']), + 'id': compat_str(e['id']), 'display_id': display_id, - 'url': [b['audio'] for b in cast_data['blings'] if b['type'] == 'BlingAudio'][0], 
- 'title': cast_data['name'], - 'description': cast_data.get('description'), - 'thumbnail': cast_data.get('image'), - 'timestamp': parse_iso8601(cast_data.get('publishingDate')), - 'duration': int_or_none(cast_data.get('duration')), + 'url': e['mediaUrl'], + 'title': e['name'], + 'description': e.get('description'), + 'thumbnail': e.get('image'), + 'timestamp': unified_timestamp(e.get('publishingDate')), + 'duration': int_or_none(e.get('duration')), } diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py index 670abce0c..dccfeaf73 100644 --- a/youtube_dl/extractor/aws.py +++ b/youtube_dl/extractor/aws.py @@ -21,11 +21,11 @@ class AWSIE(InfoExtractor): 'Accept': 'application/json', 'Host': self._AWS_PROXY_HOST, 'X-Amz-Date': amz_date, + 'X-Api-Key': self._AWS_API_KEY } session_token = aws_dict.get('session_token') if session_token: headers['X-Amz-Security-Token'] = session_token - headers['X-Api-Key'] = self._AWS_API_KEY def aws_hash(s): return hashlib.sha256(s.encode('utf-8')).hexdigest() @@ -33,9 +33,9 @@ class AWSIE(InfoExtractor): # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html canonical_querystring = compat_urllib_parse_urlencode(query) canonical_headers = '' - for header_name, header_value in headers.items(): + for header_name, header_value in sorted(headers.items()): canonical_headers += '%s:%s\n' % (header_name.lower(), header_value) - signed_headers = ';'.join([header.lower() for header in headers.keys()]) + signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())]) canonical_request = '\n'.join([ 'GET', aws_dict['uri'], diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e57310d6..beffcecd0 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -102,6 +102,7 @@ class BiliBiliIE(InfoExtractor): video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) headers = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url } headers.update(self.geo_verification_headers()) @@ -116,10 +117,15 @@ class BiliBiliIE(InfoExtractor): payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + headers = { + 'Referer': url + } + headers.update(self.geo_verification_headers()) + video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), video_id, note='Downloading video info page', - headers=self.geo_verification_headers()) + headers=headers) if 'durl' not in video_info: self._report_error(video_info) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index d8bf073f4..51c11cb7e 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -4,59 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( - dict_get, # ExtractorError, # HEADRequest, int_or_none, qualities, - remove_end, unified_strdate, ) class CanalplusIE(InfoExtractor): - IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv' - _VALID_URL = r'''(?x) - https?:// - (?: - (?: - (?:(?:www|m)\.)?canalplus\.fr| - (?:www\.)?piwiplus\.fr| - (?:www\.)?d8\.tv| - (?:www\.)?c8\.fr| - (?:www\.)?d17\.tv| - (?:(?:football|www)\.)?cstar\.fr| - (?:www\.)?itele\.fr - )/(?:(?:[^/]+/)*(?P[^/?#&]+))?(?:\?.*\bvid=(?P\d+))?| - 
player\.canalplus\.fr/#/(?P\d+) - ) - - ''' + IE_DESC = 'mycanal.fr and piwiplus.fr' + _VALID_URL = r'https?://(?:www\.)?(?Pmycanal|piwiplus)\.fr/(?:[^/]+/)*(?P[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P\d+)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json' _SITE_ID_MAP = { - 'canalplus': 'cplus', + 'mycanal': 'cplus', 'piwiplus': 'teletoon', - 'd8': 'd8', - 'c8': 'd8', - 'd17': 'd17', - 'cstar': 'd17', - 'itele': 'itele', } # Only works for direct mp4 URLs _GEO_COUNTRIES = ['FR'] _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', + 'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061', 'info_dict': { - 'id': '1405510', - 'display_id': 'pid1830-c-zapping', + 'id': '1397061', + 'display_id': 'lolywood', 'ext': 'mp4', - 'title': 'Zapping - 02/07/2016', - 'description': 'Le meilleur de toutes les chaînes, tous les jours', - 'upload_date': '20160702', + 'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34', + 'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e', + 'upload_date': '20160602', }, }, { # geo restricted, bypassed @@ -70,64 +47,12 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20140724', }, 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - # geo restricted, bypassed - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', - 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', - 'info_dict': { - 'id': '1443684', - 'display_id': 'pid6318-videos-integrales', - 'ext': 'mp4', - 'title': 'Guess my iep ! - TPMP - 07/04/2017', - 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', - 'upload_date': '20170407', - }, - 'expected_warnings': ['HTTP Error 403: Forbidden'], - }, { - 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'info_dict': { - 'id': '1420176', - 'display_id': 'rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', - 'ext': 'mp4', - 'title': 'L\'invité de Michaël Darmon du 14/10/2016 - ', - 'description': 'Chaque matin du lundi au vendredi, Michaël Darmon reçoit un invité politique à 8h25.', - 'upload_date': '20161014', - }, - }, { - 'url': 'http://football.cstar.fr/cstar-minisite-foot/pid7566-feminines-videos.html?vid=1416769', - 'info_dict': { - 'id': '1416769', - 'display_id': 'pid7566-feminines-videos', - 'ext': 'mp4', - 'title': 'France - Albanie : les temps forts de la soirée - 20/09/2016', - 'description': 'md5:c3f30f2aaac294c1c969b3294de6904e', - 'upload_date': '20160921', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://m.canalplus.fr/?vid=1398231', - 'only_matching': True, - }, { - 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]] - - # Beware, some subclasses do not define an id group - display_id = remove_end(dict_get(mobj.groupdict(), ('display_id', 'id', 'vid')), '.html') + site, display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r']+?videoId=(["\'])(?P\d+)', - r'id=["\']canal_video_player(?P\d+)', - r'data-video=["\'](?P\d+)'], - webpage, 'video id', default=mobj.group('vid'), group='id') + site_id = 
self._SITE_ID_MAP[site] info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') @@ -161,7 +86,7 @@ class CanalplusIE(InfoExtractor): format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False)) else: formats.append({ - # the secret extracted ya function in http://player.canalplus.fr/common/js/canalPlayer.js + # the secret extracted from ya function in http://player.canalplus.fr/common/js/canalPlayer.js 'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes', 'format_id': format_id, 'preference': preference(format_id), diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py deleted file mode 100644 index 6a41db87c..000000000 --- a/youtube_dl/extractor/collegerama.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - float_or_none, - int_or_none, - sanitized_Request, -) - - -class CollegeRamaIE(InfoExtractor): - _VALID_URL = r'https?://collegerama\.tudelft\.nl/Mediasite/Play/(?P[\da-f]+)' - _TESTS = [ - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', - 'md5': '481fda1c11f67588c0d9d8fbdced4e39', - 'info_dict': { - 'id': '585a43626e544bdd97aeb71a0ec907a01d', - 'ext': 'mp4', - 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 7713.088, - 'timestamp': 1413309600, - 'upload_date': '20141014', - }, - }, - { - 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', - 'md5': 'ef1fdded95bdf19b12c5999949419c92', - 'info_dict': { - 'id': '86a9ea9f53e149079fbdb4202b521ed21d', - 'ext': 'wmv', - 'title': '64ste Vakantiecursus: Afvalwater', - 'description': 'md5:7fd774865cc69d972f542b157c328305', - 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', - 'duration': 10853, - 'timestamp': 1326446400, - 'upload_date': '20120113', - }, - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - player_options_request = { - 'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - } - - request = sanitized_Request( - 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - json.dumps(player_options_request)) - request.add_header('Content-Type', 'application/json') - - player_options = self._download_json(request, video_id) - - presentation = player_options['d']['Presentation'] - title = presentation['Title'] - description = presentation.get('Description') - thumbnail = None - duration = float_or_none(presentation.get('Duration'), 1000) - timestamp = int_or_none(presentation.get('UnixTime'), 1000) - - formats = [] - for stream in presentation['Streams']: - for video in stream['VideoUrls']: - thumbnail_url = stream.get('ThumbnailUrl') - if thumbnail_url: - thumbnail = 'http://collegerama.tudelft.nl' + thumbnail_url - format_id = video['MediaType'] - if format_id == 'SS': - continue - formats.append({ - 'url': video['Location'], - 'format_id': format_id, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3b79b8cb4..5b6a09c0b 
100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1880,6 +1880,7 @@ class InfoExtractor(object): 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) @@ -2007,16 +2008,14 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats @@ -2056,7 +2055,7 @@ class InfoExtractor(object): stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e863f03ab..a3ad4df1f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -205,7 +205,6 @@ from .cnn import ( CNNArticleIE, ) from .coub import CoubIE -from .collegerama import CollegeRamaIE from .comedycentral import ( ComedyCentralFullEpisodesIE, ComedyCentralIE, @@ -345,6 +344,7 @@ from .filmon import ( FilmOnIE, FilmOnChannelIE, ) +from .filmweb import FilmwebIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE from .fivetv import FiveTVIE @@ -465,6 +465,7 @@ from .indavideo import ( ) from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE +from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE @@ -574,6 +575,7 @@ from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE from .mediaset import MediasetIE +from .mediasite import MediasiteIE from .medici import MediciIE from .megaphone import MegaphoneIE from .meipai import MeipaiIE @@ -607,7 +609,10 @@ from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE from .morningstar import MorningstarIE -from .motherless import MotherlessIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE @@ -910,7 +915,6 @@ from .rutube import ( from .rutv import RUTVIE from 
.ruutu import RuutuIE from .ruv import RuvIE -from .sandia import SandiaIE from .safari import ( SafariIE, SafariApiIE, @@ -1120,6 +1124,7 @@ from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1142,6 +1147,7 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE +from .ufctv import UFCTVIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE from .umg import UMGDeIE diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py new file mode 100644 index 000000000..56000bc5b --- /dev/null +++ b/youtube_dl/extractor/filmweb.py @@ -0,0 +1,42 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class FilmwebIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?Ptrailere|filmnytt)/article(?P\d+)\.ece' + _TEST = { + 'url': 'http://www.filmweb.no/trailere/article1264921.ece', + 'md5': 'e353f47df98e557d67edaceda9dece89', + 'info_dict': { + 'id': '13033574', + 'ext': 'mp4', + 'title': 'Det som en gang var', + 'upload_date': '20160316', + 'timestamp': 1458140101, + 'uploader_id': '12639966', + 'uploader': 'Live Roaldset', + } + } + + def _real_extract(self, url): + article_type, article_id = re.match(self._VALID_URL, url).groups() + if article_type == 'filmnytt': + webpage = self._download_webpage(url, article_id) + article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id') + embed_code = self._download_json( + 'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp', + article_id, query={ + 'articleId': article_id, + })['embedCode'] + iframe_url = self._proto_relative_url(self._search_regex( + r']+src="([^"]+)', embed_code, 'iframe url')) + + return { + '_type': 'url_transparent', + 'id': article_id, + 'url': iframe_url, + 'ie_key': 'TwentyThreeVideo', + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c7b609215..9b0cd004f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -100,6 +100,7 @@ from .megaphone import MegaphoneIE from .vzaar import VzaarIE from .channel9 import Channel9IE from .vshare import VShareIE +from .mediasite import MediasiteIE class GenericIE(InfoExtractor): @@ -1925,6 +1926,18 @@ class GenericIE(InfoExtractor): 'title': 'vl14062007715967', 'ext': 'mp4', } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } } # { # # TODO: find another test @@ -2695,9 +2708,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) @@ -2883,6 +2896,16 @@ class GenericIE(InfoExtractor): return 
self.playlist_from_matches( vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + # Look for Mediasite embeds + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py new file mode 100644 index 000000000..10ba1f6cf --- /dev/null +++ b/youtube_dl/extractor/internazionale.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_timestamp + + +class InternazionaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P[^/?#&]+)' + _TEST = { + 'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood', + 'md5': '3e39d32b66882c1218e305acbf8348ca', + 'info_dict': { + 'id': '265968', + 'display_id': 'richard-linklater-racconta-una-scena-di-boyhood', + 'ext': 'mp4', + 'title': 'Richard Linklater racconta una scena di Boyhood', + 'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665', + 'timestamp': 1424354635, + 'upload_date': '20150219', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + DATA_RE = r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' + + title = self._search_regex( + DATA_RE % 'video-title', webpage, 'title', default=None, + group='value') or self._og_search_title(webpage) + + video_id = self._search_regex( + DATA_RE % 'job-id', webpage, 'video id', group='value') + video_path = self._search_regex( + DATA_RE % 'video-path', webpage, 'video path', group='value') + + video_base = 'https://video.internazionale.it/%s/%s.' 
% (video_path, video_id) + + formats = self._extract_m3u8_formats( + video_base + 'm3u8', display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + timestamp = unified_timestamp(self._html_search_meta( + 'article:published_time', webpage, 'timestamp')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index c9bcbb08f..63d0dc998 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -23,11 +23,14 @@ class JWPlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): - mobj = re.search( - r'<(?:script|iframe)[^>]+?src=["\'](?P(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})', + urls = JWPlatformIE._extract_urls(webpage) + return urls[0] if urls else None + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})', webpage) - if mobj: - return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 1b6f5091d..f5c7abc13 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,15 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?(?:lynda\.com|educourse\.ga)/ + (?: + (?:[^/]+/){2,3}(?P\d+)| + player/embed + )/ + (?P\d+) + ''' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -113,6 +121,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -244,8 +255,9 @@ class LyndaIE(LyndaBaseIE): def _get_subtitles(self, video_id): url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) - if subs: - return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} + fixed_subs = self._fix_subtitles(subs) + if fixed_subs: + return {'en': [{'ext': 'srt', 'data': fixed_subs}]} else: return {} @@ -256,7 +268,15 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P(?:[^/]+/){2,3}(?P\d+))-2\.html' + + _TESTS = [{ + 'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }, { + 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff 
--git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py new file mode 100644 index 000000000..0e2645c55 --- /dev/null +++ b/youtube_dl/extractor/mediasite.py @@ -0,0 +1,214 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + float_or_none, + mimetype2ext, + unescapeHTML, + unsmuggle_url, + urljoin, +) + + +class MediasiteIE(InfoExtractor): + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' + _TESTS = [ + { + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', + 'info_dict': { + 'id': '2db6c271681e4f199af3c60d1f82869b1d', + 'ext': 'mp4', + 'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles', + 'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.', + 'timestamp': 1474268400.0, + 'upload_date': '20160919', + }, + }, + { + 'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb', + 'info_dict': { + 'id': '90bb363295d945d6b548c867d01181361d', + 'ext': 'mp4', + 'upload_date': '20150429', + 'title': '5) IT-forum 2015-Dag 1 - Dungbeetle - How and why Rain created a tiny bug tracker for Unity', + 'timestamp': 1430311380.0, + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d', + 'md5': '481fda1c11f67588c0d9d8fbdced4e39', + 'info_dict': { + 'id': '585a43626e544bdd97aeb71a0ec907a01d', + 'ext': 'mp4', + 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$', + 'duration': 7713.088, + 'timestamp': 1413309600, + 'upload_date': '20141014', + }, + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4', + 'md5': 'ef1fdded95bdf19b12c5999949419c92', + 'info_dict': { + 'id': '86a9ea9f53e149079fbdb4202b521ed21d', + 'ext': 'wmv', + 'title': '64ste Vakantiecursus: Afvalwater', + 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', + 'duration': 10853, + 'timestamp': 1326446400, + 'upload_date': '20120113', + }, + }, + { + 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', + 'md5': '9422edc9b9a60151727e4b6d8bef393d', + 'info_dict': { + 'id': '24aace4429fc450fb5b38cdbf424a66e1d', + 'ext': 'mp4', + 'title': 'Xyce Software Training - Section 1', + 'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}', + 'upload_date': '20120409', + 'timestamp': 1333983600, + 'duration': 7794, + } + } + ] + + # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) + _STREAM_TYPES = { + 0: 'video1', # the main video + 2: 'slide', + 3: 'presentation', + 4: 'video2', # screencast? 
+ 5: 'video3', + } + + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', + webpage)] + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) + resource_id = mobj.group('id') + query = mobj.group('query') + + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? + redirect_url = compat_str(urlh.geturl()) + + # XXX: might have also extracted UrlReferrer and QueryString from the html + service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + r']+\bid=["\']ServicePath[^>]+>(.+?)', webpage, resource_id, + default='/Mediasite/PlayerService/PlayerService.svc/json')) + + player_options = self._download_json( + '%s/GetPlayerOptions' % service_path, resource_id, + headers={ + 'Content-type': 'application/json; charset=utf-8', + 'X-Requested-With': 'XMLHttpRequest', + }, + data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': resource_id, + 'QueryString': query, + 'UrlReferrer': data.get('UrlReferrer', ''), + 'UseScreenReader': False, + } + }).encode('utf-8'))['d'] + + presentation = player_options['Presentation'] + title = presentation['Title'] + + if presentation is None: + raise ExtractorError( + 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], + expected=True) + + thumbnails = [] + formats = [] + for snum, Stream in enumerate(presentation['Streams']): + stream_type = Stream.get('StreamType') + if stream_type is None: + continue + + video_urls = Stream.get('VideoUrls') + if not isinstance(video_urls, list): + video_urls = [] + + stream_id = self._STREAM_TYPES.get( + stream_type, 'type%u' % stream_type) + + stream_formats = [] + for unum, VideoUrl in enumerate(video_urls): + video_url = VideoUrl.get('Location') + if not video_url or not isinstance(video_url, compat_str): + continue + # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS + + media_type = VideoUrl.get('MediaType') + if media_type == 'SS': + stream_formats.extend(self._extract_ism_formats( + video_url, resource_id, + ism_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + elif media_type == 'Dash': + stream_formats.extend(self._extract_mpd_formats( + video_url, resource_id, + mpd_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + else: + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'url': video_url, + 'ext': mimetype2ext(VideoUrl.get('MimeType')), + }) + + # TODO: if Stream['HasSlideContent']: + # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) + # from Stream['Slides'] + # this will require writing a custom downloader... 
+ + # disprefer 'secondary' streams + if stream_type != 0: + for fmt in stream_formats: + fmt['preference'] = -1 + + thumbnail_url = Stream.get('ThumbnailUrl') + if thumbnail_url: + thumbnails.append({ + 'id': '%s-%u' % (stream_id, snum), + 'url': urljoin(redirect_url, thumbnail_url), + 'preference': -1 if stream_type != 0 else 0, + }) + formats.extend(stream_formats) + + self._sort_formats(formats) + + # XXX: Presentation['Presenters'] + # XXX: Presentation['Transcript'] + + return { + 'id': resource_id, + 'title': title, + 'description': presentation.get('Description'), + 'duration': float_or_none(presentation.get('Duration'), 1000), + 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), + 'formats': formats, + 'thumbnails': thumbnails, + } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 964dc542c..42759eae8 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import json import uuid from .common import InfoExtractor from .ooyala import OoyalaIE from ..compat import ( compat_str, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -42,31 +42,33 @@ class MiTeleBaseIE(InfoExtractor): duration = int_or_none(mmc.get('duration')) for location in mmc['locations']: gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') + gcp = location.get('gcp') ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): + if None in (gat, gcp, ogn): continue token_data = { - 'bas': bas, - 'icd': loc, + 'gcp': gcp, 'ogn': ogn, - 'sta': '0', + 'sta': 0, } media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - video_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: + gat, video_id, data=json.dumps(token_data).encode('utf-8'), + headers={ + 'Content-Type': 'application/json;charset=utf-8', + 'Referer': url, + }) + stream = media.get('stream') or media.get('file') + if not stream: continue - ext = determine_ext(file_) + ext = determine_ext(stream) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + stream, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,86 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! 
:)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' + }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + entries = [] + for mobj in re.finditer( + r'href="(?P/[^"]+)"[^>]*>(?:\s*]+alt="[^-]+-\s(?P[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 8e13bcf1f..5c8b37e18 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -19,11 +19,11 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer|live)/(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', + 'md5': '0b62089b479e06681abaaca9d204f152', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', @@ -35,7 +35,6 @@ class OdnoklassnikiIE(InfoExtractor): 'like_count': int, 'age_limit': 0, }, - 'skip': 'Video has been blocked', }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', @@ -99,6 +98,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://mobile.ok.ru/video/20079905452', 'only_matching': True, + }, { + 'url': 'https://www.ok.ru/live/484531969818', + 'only_matching': True, }] def _real_extract(self, url): @@ -184,6 +186,10 @@ class OdnoklassnikiIE(InfoExtractor): }) return info + assert title + if provider == 'LIVE_TV_APP': + info['title'] = self._live_title(title) + quality = qualities(('4', 
'0', '1', '2', '3', '5')) formats = [{ @@ -210,6 +216,20 @@ class OdnoklassnikiIE(InfoExtractor): if fmt_type: fmt['quality'] = quality(fmt_type) + # Live formats + m3u8_url = metadata.get('hlsMasterPlaylistUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + rtmp_url = metadata.get('rtmpUrl') + if rtmp_url: + formats.append({ + 'url': rtmp_url, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d1eb3be25..eaaaf8a08 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -284,6 +284,11 @@ class OpenloadIE(InfoExtractor): # for title and ext 'url': 'https://openload.co/embed/Sxz5sADo82g/', 'only_matching': True, + }, { + # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available + # via https://openload.co/f/e-Ixz9ZR5L0/ + 'url': 'https://openload.co/f/e-Ixz9ZR5L0/', + 'only_matching': True, }, { 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', 'only_matching': True, @@ -305,20 +310,34 @@ class OpenloadIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://openload.co/embed/%s/' % video_id + url_pattern = 'https://openload.co/%%s/%s/' % video_id headers = { 'User-Agent': self._USER_AGENT, } - webpage = self._download_webpage(url, video_id, headers=headers) - - if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True, video_id=video_id) + for path in ('embed', 'f'): + page_url = url_pattern % path + last = path == 'f' + webpage = self._download_webpage( + page_url, video_id, 'Downloading %s webpage' % path, + headers=headers, fatal=last) + if not webpage: + continue + if 'File not found' in webpage or 'deleted by the owner' in webpage: + if not last: + continue + raise ExtractorError('File not found', expected=True, video_id=video_id) + break phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) + webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) + + decoded_id = (get_element_by_id('streamurl', webpage) or + get_element_by_id('streamuri', webpage) or + get_element_by_id('streamurj', webpage)) - decoded_id = get_element_by_id('streamurl', webpage) + if not decoded_id: + raise ExtractorError('Can\'t find stream URL', video_id=video_id) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id @@ -327,7 +346,7 @@ class OpenloadIE(InfoExtractor): 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - entries = self._parse_html5_media_entries(url, webpage, video_id) + entries = self._parse_html5_media_entries(page_url, webpage, video_id) entry = entries[0] if entries else {} subtitles = entry.get('subtitles') diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 391e1bd09..4c5f57919 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -24,7 +24,7 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'description': 'md5:f93d398691044d303bc4a3de62f3e976', + 'description': 'md5:4436e61b7df227a093778efb7e373571', 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', 'duration': 279, 'timestamp': 1438732860, 
@@ -36,9 +36,19 @@ class PlaytvakIE(InfoExtractor): 'info_dict': { 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', - 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', - 'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # another live stream, this one without Misc.videoFLV + 'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap', + 'info_dict': { + 'id': 'A151218_145728_hlavni-nadrazi_plap', + 'ext': 'flv', + 'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -95,7 +105,7 @@ class PlaytvakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) info_url = self._html_search_regex( - r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') parsed_url = compat_urlparse.urlparse(info_url) @@ -160,7 +170,7 @@ class PlaytvakIE(InfoExtractor): if is_live: title = self._live_title(title) description = self._og_search_description(webpage, default=None) or self._html_search_meta( - 'description', webpage, 'description') + 'description', webpage, 'description', default=None) timestamp = None duration = None if not is_live: diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 597b11218..aacc5d4bb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -171,12 +171,12 @@ class PluralsightIE(PluralsightBaseIE): for num, current in enumerate(subs): current = subs[num] start, text = ( - float_or_none(dict_get(current, TIME_OFFSET_KEYS)), + float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)), dict_get(current, TEXT_KEYS)) if start is None or text is None: continue end = duration if num == len(subs) - 1 else float_or_none( - dict_get(subs[num + 1], TIME_OFFSET_KEYS)) + dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False)) if end is None: continue srt += os.linesep.join( diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index d9edf9da2..fa60ffd5e 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -31,6 +31,9 @@ def _decrypt_url(png): hash_index = data.index('#') alphabet_data = data[:hash_index] url_data = data[hash_index + 1:] + if url_data[0] == 'H' and url_data[3] == '%': + # remove useless HQ%% at the start + url_data = url_data[4:] alphabet = [] e = 0 diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py deleted file mode 100644 index 96e43af84..000000000 --- a/youtube_dl/extractor/sandia.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - mimetype2ext, -) - - -class SandiaIE(InfoExtractor): - IE_DESC = 'Sandia National Laboratories' - _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)' - _TEST = { - 'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d', - 'md5': '9422edc9b9a60151727e4b6d8bef393d', - 'info_dict': { - 'id': '24aace4429fc450fb5b38cdbf424a66e1d', - 'ext': 'mp4', - 'title': 'Xyce Software 
Training - Section 1', - 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120409', - 'timestamp': 1333983600, - 'duration': 7794, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - presentation_data = self._download_json( - 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', - video_id, data=json.dumps({ - 'getPlayerOptionsRequest': { - 'ResourceId': video_id, - 'QueryString': '', - } - }), headers={ - 'Content-Type': 'application/json; charset=utf-8', - })['d']['Presentation'] - - title = presentation_data['Title'] - - formats = [] - for stream in presentation_data.get('Streams', []): - for fd in stream.get('VideoUrls', []): - formats.append({ - 'format_id': fd['MediaType'], - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': presentation_data.get('Description'), - 'formats': formats, - 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), - 'duration': int_or_none(presentation_data.get('Duration'), 1000), - } diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index 7145d285a..6fc2ff60d 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor class SlutloadIE(InfoExtractor): _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' - _TEST = { + _TESTS = [{ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'md5': '868309628ba00fd488cf516a113fd717', 'info_dict': { @@ -15,11 +17,17 @@ class SlutloadIE(InfoExtractor): 'age_limit': 18, 'thumbnail': r're:https?://.*?\.jpg' } - } + }, { + # mobile site + 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) + webpage = self._download_webpage(desktop_url, video_id) video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip() diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 8894f4b0c..6c9816eef 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -136,6 +136,25 @@ class SoundcloudIE(InfoExtractor): 'license': 'all-rights-reserved', }, }, + # no album art, use avatar pic for thumbnail + { + 'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real', + 'md5': '59c7872bc44e5d99b7211891664760c2', + 'info_dict': { + 'id': '309699954', + 'ext': 'mp3', + 'title': 'Sideways (Prod. 
Mad Real)', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'uploader': 'garyvee', + 'upload_date': '20170226', + 'duration': 207, + 'thumbnail': r're:https?://.*\.jpg', + 'license': 'all-rights-reserved', + }, + 'params': { + 'skip_download': True, + }, + }, ] _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' @@ -160,7 +179,7 @@ class SoundcloudIE(InfoExtractor): name = full_title or track_id if quiet: self.report_extraction(name) - thumbnail = info.get('artwork_url') + thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') ext = 'mp3' diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py new file mode 100644 index 000000000..aa0c6e90f --- /dev/null +++ b/youtube_dl/extractor/twentythreevideo.py @@ -0,0 +1,77 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TwentyThreeVideoIE(InfoExtractor): + IE_NAME = '23video' + _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' + _TEST = { + 'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1', + 'md5': '75fcf216303eb1dae9920d651f85ced4', + 'info_dict': { + 'id': '20448876', + 'ext': 'mp4', + 'title': 'Video Marketing Minute: Personalized Video', + 'timestamp': 1513855354, + 'upload_date': '20171221', + 'uploader_id': '12258964', + 'uploader': 'Rasmus Bysted', + } + } + + def _real_extract(self, url): + domain, query, photo_id = re.match(self._VALID_URL, url).groups() + base_url = 'https://video.%s' % domain + photo_data = self._download_json( + base_url + '/api/photo/list?' 
+ query, photo_id, query={ + 'format': 'json', + }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo'] + title = photo_data['title'] + + formats = [] + + audio_path = photo_data.get('audio_download') + if audio_path: + formats.append({ + 'format_id': 'audio', + 'url': base_url + audio_path, + 'filesize': int_or_none(photo_data.get('audio_size')), + 'vcodec': 'none', + }) + + def add_common_info_to_list(l, template, id_field, id_value): + f_base = template % id_value + f_path = photo_data.get(f_base + 'download') + if not f_path: + return + l.append({ + id_field: id_value, + 'url': base_url + f_path, + 'width': int_or_none(photo_data.get(f_base + 'width')), + 'height': int_or_none(photo_data.get(f_base + 'height')), + 'filesize': int_or_none(photo_data.get(f_base + 'size')), + }) + + for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'): + add_common_info_to_list(formats, 'video_%s_', 'format_id', f) + + thumbnails = [] + for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'): + add_common_info_to_list(thumbnails, '%s_', 'id', t) + + return { + 'id': photo_id, + 'title': title, + 'timestamp': int_or_none(photo_data.get('creation_date_epoch')), + 'duration': int_or_none(photo_data.get('video_length')), + 'view_count': int_or_none(photo_data.get('view_count')), + 'comment_count': int_or_none(photo_data.get('number_of_comments')), + 'uploader_id': photo_data.get('user_id'), + 'uploader': photo_data.get('display_name'), + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bf57eac01..f9164af09 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -358,9 +358,16 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): break offset += limit return self.playlist_result( - [self.url_result(entry) for entry in orderedSet(entries)], + [self._make_url_result(entry) for entry in orderedSet(entries)], channel_id, channel_name) + def _make_url_result(self, url): + try: + video_id = 'v%s' % TwitchVodIE._match_id(url) + return self.url_result(url, TwitchVodIE.ie_key(), video_id=video_id) + except AssertionError: + return self.url_result(url) + def _extract_playlist_page(self, response): videos = response.get('videos') return [video['url'] for video in videos] if videos else [] diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py new file mode 100644 index 000000000..ab823814b --- /dev/null +++ b/youtube_dl/extractor/ufctv.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, +) + + +class UFCTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P<id>[^/]+)' + _TEST = { + 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', + 'info_dict': { + 'id': '34167', + 'ext': 'mp4', + 'title': 'UFC 219 Countdown: Full Episode', + 'description': 'md5:26d4e8bf4665ae5878842d7050c3c646', + 'timestamp': 1513962360, + 'upload_date': '20171222', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + video_data = self._download_json(url, display_id, query={ + 'format': 'json', + }) + video_id = str(video_data['id']) + title = video_data['name'] + m3u8_url = self._download_json( + 'https://www.ufc.tv/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 
'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + })['path'] + m3u8_url = m3u8_url.replace('_iphone.', '.') + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': parse_duration(video_data.get('runtime')), + 'timestamp': parse_iso8601(video_data.get('releaseDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cedb54876..6af705657 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -468,11 +468,12 @@ class VimeoIE(VimeoBaseInfoExtractor): request = sanitized_Request(url, headers=headers) try: webpage, urlh = self._download_webpage_handle(request, video_id) + redirect_url = compat_str(urlh.geturl()) # Some URLs redirect to ondemand can't be extracted with # this extractor right away thus should be passed through # ondemand extractor (e.g. https://vimeo.com/73445910) - if VimeoOndemandIE.suitable(urlh.geturl()): - return self.url_result(urlh.geturl(), VimeoOndemandIE.ie_key()) + if VimeoOndemandIE.suitable(redirect_url): + return self.url_result(redirect_url, VimeoOndemandIE.ie_key()) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: errmsg = ee.cause.read() @@ -541,15 +542,15 @@ class VimeoIE(VimeoBaseInfoExtractor): if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None: if '_video_password_verified' in data: raise ExtractorError('video password verification failed!') - self._verify_video_password(url, video_id, webpage) + self._verify_video_password(redirect_url, video_id, webpage) return self._real_extract( - smuggle_url(url, {'_video_password_verified': 'verified'})) + smuggle_url(redirect_url, {'_video_password_verified': 'verified'})) else: raise ExtractorError('Unable to extract info section', cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(url, video_id) + config = self._verify_player_video_password(redirect_url, video_id) def is_rented(): if '>You rented this title.<' in webpage: diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index c7947d4a1..5b0b248cd 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -245,13 +245,19 @@ class YoukuShowIE(InfoExtractor): # No data-id value. 'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', 'only_matching': True, + }, { + # Wrong number of reload_id. 
+        'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html',
+        'only_matching': True,
     }]
 
     def _extract_entries(self, playlist_data_url, show_id, note, query):
         query['callback'] = 'cb'
         playlist_data = self._download_json(
             playlist_data_url, show_id, query=query, note=note,
-            transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+            transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html')
+        if playlist_data is None:
+            return [None, None]
         drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
                       get_element_by_class('p-drama-half-row', playlist_data))
         if drama_list is None:
@@ -291,8 +297,8 @@ class YoukuShowIE(InfoExtractor):
                     'id': page_config['showid'],
                     'stage': reload_id,
                 })
-            entries.extend(new_entries)
-
+            if new_entries is not None:
+                entries.extend(new_entries)
         desc = self._html_search_meta('description', webpage, fatal=False)
         playlist_title = desc.split(',')[0] if desc else None
         detail_li = get_element_by_class('p-intro', webpage)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 2843a3dc0..386897a85 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
     compat_HTMLParser,
     compat_basestring,
     compat_chr,
+    compat_ctypes_WINFUNCTYPE,
     compat_etree_fromstring,
     compat_expanduser,
     compat_html_entities,
@@ -1330,24 +1331,24 @@ def _windows_write_string(s, out):
     if fileno not in WIN_OUTPUT_IDS:
         return False
 
-    GetStdHandle = ctypes.WINFUNCTYPE(
+    GetStdHandle = compat_ctypes_WINFUNCTYPE(
         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
-        (b'GetStdHandle', ctypes.windll.kernel32))
+        ('GetStdHandle', ctypes.windll.kernel32))
     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 
-    WriteConsoleW = ctypes.WINFUNCTYPE(
+    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
-        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
+        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
     written = ctypes.wintypes.DWORD(0)
 
-    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
+    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
     FILE_TYPE_CHAR = 0x0002
     FILE_TYPE_REMOTE = 0x8000
-    GetConsoleMode = ctypes.WINFUNCTYPE(
+    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
         ctypes.POINTER(ctypes.wintypes.DWORD))(
-        (b'GetConsoleMode', ctypes.windll.kernel32))
+        ('GetConsoleMode', ctypes.windll.kernel32))
     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 
     def not_a_console(handle):
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index f999584d7..9030e2415 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2017.12.23'
+__version__ = '2018.01.07'
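
The openload change above tries the /embed/ page first and falls back to the /f/ page, failing hard only on the last candidate. The sketch below is not youtube-dl code; it is a minimal, standard-library-only rendering of that fallback pattern, and the function name and marker strings are illustrative assumptions.

import urllib.error
import urllib.request


def first_available_page(candidate_urls, markers=('File not found', 'deleted by the owner')):
    # Return (url, html) for the first candidate that loads and is not marked
    # as missing; only the last candidate may fail hard, mirroring the
    # fatal=last / "if not last: continue" logic in the patch above.
    last_index = len(candidate_urls) - 1
    for i, url in enumerate(candidate_urls):
        last = i == last_index
        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf-8', 'replace')
        except urllib.error.URLError:
            if last:
                raise
            continue
        if any(marker in html for marker in markers):
            if not last:
                continue
            raise RuntimeError('File not found: %s' % url)
        return url, html

Called as first_available_page(['https://openload.co/embed/%s/' % video_id, 'https://openload.co/f/%s/' % video_id]) it follows the same embed-then-f order as the extractor.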
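
The slutload change fetches the desktop page even when a mobile URL is given. The same rewrite, isolated as a tiny helper (the helper name is illustrative):

import re


def to_desktop_url(url):
    # Drop a leading "mobile." host label, as the slutload extractor does
    # before downloading the webpage.
    return re.sub(r'^(https?://)mobile\.', r'\1', url)

# to_desktop_url('http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/')
# -> 'http://slutload.com/video/masturbation-solo/fviFLmc6kzJ/'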
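
The soundcloud change falls back to the uploader's avatar when a track has no album art and requests a larger rendition. A small sketch of that lookup chain; the dict keys are the API fields used in the patch, the function name is illustrative:

def soundcloud_thumbnail(track):
    # Prefer track artwork, then the uploader's avatar, as in the patch.
    thumbnail = track.get('artwork_url') or track.get('user', {}).get('avatar_url')
    if isinstance(thumbnail, str):
        # Ask the image CDN for the 500x500 rendition instead of "large".
        thumbnail = thumbnail.replace('-large', '-t500x500')
    return thumbnail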
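
The new 23video extractor builds both formats and thumbnails from flat API keys such as video_hd_download / video_hd_width. The standalone sketch below captures that key-template idea; it is simplified (no int_or_none coercion) and the helper name is illustrative:

def collect_variants(photo_data, base_url, template, id_field, variants):
    # Build format or thumbnail dicts from flat keys like
    # 'video_hd_download', 'video_hd_width', 'video_hd_height', 'video_hd_size'.
    collected = []
    for variant in variants:
        prefix = template % variant
        path = photo_data.get(prefix + 'download')
        if not path:
            continue
        collected.append({
            id_field: variant,
            'url': base_url + path,
            'width': photo_data.get(prefix + 'width'),
            'height': photo_data.get(prefix + 'height'),
            'filesize': photo_data.get(prefix + 'size'),
        })
    return collected

Used as collect_variants(photo_data, base_url, 'video_%s_', 'format_id', ('mobile_high', 'medium', 'hd', '1080p', '4k')) for formats and collect_variants(photo_data, base_url, '%s_', 'id', ('quad16', 'small', 'original')) for thumbnails, matching the two loops in the extractor.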
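
The utils.py hunk routes the WINFUNCTYPE calls through a compat_ctypes_WINFUNCTYPE wrapper (defined in compat.py, not shown in this diff) and switches the exported function names from bytes to str. The fragment below only illustrates the underlying ctypes idiom being wrapped, binding a kernel32 function from a (name, dll) tuple; it runs on Windows only, and the STD_OUTPUT_HANDLE constant is supplied here for the example.

import ctypes
import ctypes.wintypes

# Prototype for: HANDLE GetStdHandle(DWORD nStdHandle)
GetStdHandle = ctypes.WINFUNCTYPE(
    ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
    ('GetStdHandle', ctypes.windll.kernel32))  # str name, as in the new code

STD_OUTPUT_HANDLE = -11
stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE)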