From: remitamine Date: Wed, 24 May 2017 08:21:33 +0000 (+0200) Subject: Merge pull request #12861 from Tithen-Firion/cbsinteractive-fix X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=9e35298f973b25c513e30bbc1fe2a05a1c6b7fab;hp=96820c1c6bd2a9613725fc17fd44fd00d3301475;p=youtube-dl Merge pull request #12861 from Tithen-Firion/cbsinteractive-fix [cbsinteractive] update extractor and test cases --- diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index dedc07604..730a07ed1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.26*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.05.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.05.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.04.26 +[debug] youtube-dl version 2017.05.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index 1bdb74285..f80c34a08 100644 --- a/AUTHORS +++ b/AUTHORS @@ -212,3 +212,8 @@ Xiao Di Guan Thomas Winant Daniel Twardowski Jeremie Jarosh +Gerard Rovira +Marvin Ewald +Frédéric Bournival +Timendum +gritstub diff --git a/ChangeLog b/ChangeLog index b23aa10d5..75a95f0b0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,166 @@ +version 2017.05.23 + +Core ++ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183) ++ [adobepass] Add support for Bright House Networks (#13149) + +Extractors ++ [streamcz] Add support for subtitles (#13174) +* [youtube] Fix DASH manifest signature decryption (#8944, #13156) +* [toggle] Relax URL regular expression (#13172) +* [toypics] Fix extraction (#13077) +* [njpwworld] Fix extraction (#13162, #13169) ++ [hitbox] Add support for smashcast.tv (#13154) +* [mitele] Update app key regular expression (#13158) + + +version 2017.05.18.1 + +Core +* [jsinterp] Fix typo and cleanup regular expressions (#13134) + + +version 2017.05.18 + +Core ++ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125, + #13126, #13128, #13129, #13130, #13131, #13132) ++ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats + (#13088, #13092) ++ [utils] Recognize more audio codecs (#13081) + +Extractors ++ [vier] Extract more metadata (#12539) +* [vier] Improve extraction (#12801) + + Add support for authentication + * Bypass authentication when no credentials provided + * Improve extraction robustness +* [dailymail] Fix sources extraction (#13057) +* [dailymotion] Extend URL regular expression (#13079) + + +version 2017.05.14 + +Core ++ [extractor/common] Respect Width and Height attributes in ISM manifests ++ [postprocessor/metadatafromtitle] Add support regular expression syntax for + --metadata-from-title (#13065) + +Extractors ++ [mediaset] Add support for video.mediaset.it (#12708, #12964) +* [orf:radio] Fix extraction (#11643, #12926) +* [aljazeera] Extend URL regular expression (#13053) +* [imdb] Relax URL regular expression (#13056) ++ [francetv] Add support for mobile.france.tv (#13068) ++ [upskill] Add support for upskillcourses.com (#13043) +* [thescene] Fix extraction (#13061) +* [condenast] Improve embed support +* [liveleak] Fix extraction (#12053) ++ [douyu] Support Douyu shows (#12228) +* [myspace] Improve URL regular expression (#13040) +* [adultswim] Use desktop platform in assets URL (#13041) + + +version 2017.05.09 + +Core +* [YoutubeDL] Force --restrict-filenames when no locale is set on all python + versions (#13027) + +Extractors +* [francetv] Adapt to site redesign (#13034) ++ [packtpub] Add support for authentication (#12622) +* [drtv] Lower preference for SignLanguage formats (#13013, #13016) ++ [cspan] Add support for brightcove live embeds (#13028) +* [vrv] Extract DASH formats and subtitles +* [funimation] Fix authentication (#13021) +* [adultswim] Fix extraction (#8640, #10950, #11042, #12121) + + Add support for Adobe Pass authentication + + Add support for live streams + + Add support for show pages +* [turner] Extract thumbnail, is_live and strip description ++ [nonktube] Add support for nonktube.com (#8647, #13024) ++ [nuevo] Pass headers to _extract_nuevo +* [nbc] Improve extraction (#12364) + + +version 2017.05.07 + +Common +* [extractor/common] Fix typo in _extract_akamai_formats ++ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata ++ [extractor/common] Introduce chapters meta field + +Extractors +* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995, + #13003) +* [bilibili] Fix video downloading (#13001) +* [rmcdecouverte] Fix extraction (#12937) +* [theplatform] Extract chapters +* [bandcamp] Fix thumbnail extraction (#12980) +* [pornhub] Extend URL regular expression (#12996) ++ [youtube] Extract chapters ++ [nrk] Extract chapters ++ [vice] Add support for ooyala embeds in article pages ++ [vice] Support vice articles (#12968) +* [vice] Fix extraction for non en_us videos (#12967) +* [gdcvault] Fix extraction for some videos (#12733) +* [pbs] Improve multipart video support (#12981) +* [laola1tv] Fix extraction (#12880) ++ [cda] Support birthday verification (#12789) +* [leeco] Fix extraction (#12974) ++ [pbs] Extract chapters +* [amp] Imporove thumbnail and subtitles extraction +* [foxsports] Fix extraction (#12945) +- [coub] Remove comment count extraction (#12941) + + +version 2017.05.01 + +Core ++ [extractor/common] Extract view count from JSON-LD +* [utils] Improve unified_timestamp ++ [utils] Add video/mp2t to mimetype2ext +* [downloader/external] Properly handle live stream downloading cancellation + (#8932) ++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906) + +Extractors +* [infoq] Make audio format extraction non fatal (#12938) +* [brightcove] Allow whitespace around attribute names in embedded code ++ [zaq1] Add support for zaq1.pl (#12693) ++ [xvideos] Extract duration (#12828) +* [vevo] Fix extraction (#12879) ++ [noovo] Add support for noovo.ca (#12792) ++ [washingtonpost] Add support for embeds (#12699) +* [yandexmusic:playlist] Fix extraction for python 3 (#12888) +* [anvato] Improve extraction (#12913) + * Promote to regular shortcut based extractor + * Add mcp to access key mapping table + * Add support for embeds extraction + * Add support for anvato embeds in generic extractor +* [xtube] Fix extraction for older FLV videos (#12734) +* [tvplayer] Fix extraction (#12908) + + +version 2017.04.28 + +Core ++ [adobepass] Use geo verification headers for all requests +- [downloader/fragment] Remove assert for resume_len when no fragments + downloaded ++ [extractor/common] Add manifest_url for explicit group rendition formats +* [extractor/common] Fix manifest_url for m3u8 formats +- [extractor/common] Don't list master m3u8 playlists in format list (#12832) + +Extractor +* [aenetworks] Fix extraction for shows with single season ++ [go] Add support for Disney, DisneyJunior and DisneyXD show pages +* [youtube] Recognize new locale-based player URLs (#12885) ++ [streamable] Add support for new embedded URL schema (#12844) +* [arte:+7] Relax URL regular expression (#12837) + + version 2017.04.26 Core @@ -6,19 +169,19 @@ Core * [YoutubeDL] Fix output template for missing timestamp (#12796) * [socks] Handle cases where credentials are required but missing * [extractor/common] Improve HLS extraction (#12211) - - Extract m3u8 parsing to separate method - - Improve rendition groups extraction - - Build stream name according stream GROUP-ID - - Ignore reference to AUDIO group without URI when stream has no CODECS - - Use float for scaled tbr in _parse_m3u8_formats + * Extract m3u8 parsing to separate method + * Improve rendition groups extraction + * Build stream name according stream GROUP-ID + * Ignore reference to AUDIO group without URI when stream has no CODECS + * Use float for scaled tbr in _parse_m3u8_formats * [utils] Add support for TTML styles in dfxp2srt * [downloader/hls] No need to download keys for fragments that have been already downloaded * [downloader/fragment] Improve fragment downloading - - Resume immediately - - Don't concatenate fragments and decrypt them on every resume - - Optimize disk storage usage, don't store intermediate fragments on disk - - Store bookkeeping download state file + * Resume immediately + * Don't concatenate fragments and decrypt them on every resume + * Optimize disk storage usage, don't store intermediate fragments on disk + * Store bookkeeping download state file + [extractor/common] Add support for multiple getters in try_get + [extractor/common] Add support for video of WebPage context in _json_ld (#12778) diff --git a/README.md b/README.md index 1ecd2005a..dc0be1f40 100644 --- a/README.md +++ b/README.md @@ -400,12 +400,14 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format - syntax is the same as --output, the parsed - parameters replace existing values. - Additional templates: %(album)s, - %(artist)s. Example: --metadata-from-title - "%(artist)s - %(title)s" matches a title - like "Coldplay - Paradise" + syntax is the same as --output. Regular + expression with named capture groups may + also be used. The parsed parameters replace + existing values. Example: --metadata-from- + title "%(artist)s - %(title)s" matches a + title like "Coldplay - Paradise". Example + (regex): --metadata-from-title + "(?P.+?) - (?P.+)" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0be7af8c4..5545cff23 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -45,6 +45,7 @@ - **anderetijden**: npo.nl and ntr.nl - **AnimeOnDemand** - **anitube.se** + - **Anvato** - **AnySex** - **Aparat** - **AppleConnect** @@ -215,6 +216,7 @@ - **DiscoveryVR** - **Disney** - **Dotsub** + - **DouyuShow** - **DouyuTV**: 斗鱼 - **DPlay** - **DPlayIt** @@ -280,7 +282,8 @@ - **france2.fr:generation-quoi** - **FranceCulture** - **FranceInter** - - **francetv**: France 2, 3, 4, 5 and Ô + - **FranceTV** + - **FranceTVEmbed** - **francetvinfo.fr** - **Freesound** - **freespeech.org** @@ -431,6 +434,7 @@ - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **Medialaan** + - **Mediaset** - **Medici** - **Meipai**: 美拍 - **MelonVOD** @@ -529,6 +533,8 @@ - **NJPWWorld**: 新日本プロレスワールド - **NobelPrize** - **Noco** + - **NonkTube** + - **Noovo** - **Normalboots** - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz @@ -600,7 +606,6 @@ - **pluralsight** - **pluralsight:course** - **plus.google**: Google Plus - - **pluzz.francetv.fr** - **podomatic** - **Pokemon** - **PolskieRadio** @@ -798,7 +803,7 @@ - **ToonGoggles** - **Tosh**: Tosh.0 - **tou.tv** - - **Toypics**: Toypics user profile + - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** @@ -858,6 +863,8 @@ - **uol.com.br** - **uplynk** - **uplynk:preplay** + - **Upskill** + - **UpskillCourse** - **Urort**: NRK P3 Urørt - **URPlay** - **USANetwork** @@ -877,9 +884,10 @@ - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Viafree** - - **Vice** + - **vice** + - **vice:article** + - **vice:show** - **Viceland** - - **ViceShow** - **Vidbit** - **Viddler** - **Videa** @@ -1013,6 +1021,7 @@ - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) - **Zapiks** + - **Zaq1** - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 2a3c3f7cb..6f52e11f7 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -184,16 +184,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'pluzz_francetv_11507', 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', [{ - 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', - 'ext': 'mp4', - 'format_id': 'meta', - 'format_note': 'Quality selection URL', - 'protocol': 'm3u8', - 'preference': -100, - 'resolution': 'multiple' - }, { 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0', + 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', 'ext': 'mp4', 'format_id': '180', 'protocol': 'm3u8', @@ -204,7 +196,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 144, }, { 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0', + 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', 'ext': 'mp4', 'format_id': '303', 'protocol': 'm3u8', @@ -215,7 +207,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 180, }, { 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0', + 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', 'ext': 'mp4', 'format_id': '575', 'protocol': 'm3u8', @@ -226,7 +218,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 288, }, { 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0', + 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', 'ext': 'mp4', 'format_id': '831', 'protocol': 'm3u8', @@ -237,7 +229,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 396, }, { 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0', - 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0', + 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', 'ext': 'mp4', 'protocol': 'm3u8', 'format_id': '1467', @@ -254,28 +246,22 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'teamcoco_11995', 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', [{ - 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', - 'ext': 'mp4', - 'format_id': 'meta', - 'format_note': 'Quality selection URL', - 'protocol': 'm3u8', - 'preference': -100, - 'resolution': 'multiple', - }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': 'audio-0-Default', 'protocol': 'm3u8', 'vcodec': 'none', }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': 'audio-1-Default', 'protocol': 'm3u8', 'vcodec': 'none', }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': '71', 'protocol': 'm3u8', @@ -284,7 +270,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 71, }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': '413', 'protocol': 'm3u8', @@ -295,7 +281,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 224, }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': '522', 'protocol': 'm3u8', @@ -306,7 +292,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 224, }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-1m_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-1m_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': '1205', 'protocol': 'm3u8', @@ -317,7 +303,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 360, }, { 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-2m_v4.m3u8', - 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-2m_v4.m3u8', + 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', 'ext': 'mp4', 'format_id': '2374', 'protocol': 'm3u8', @@ -334,15 +320,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'toggle_mobile_12211', 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', [{ - 'url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', - 'ext': 'mp4', - 'format_id': 'meta', - 'format_note': 'Quality selection URL', - 'protocol': 'm3u8', - 'preference': -100, - 'resolution': 'multiple' - }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': 'audio-English', 'protocol': 'm3u8', @@ -350,6 +329,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'vcodec': 'none', }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': 'audio-Undefined', 'protocol': 'm3u8', @@ -357,7 +337,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'vcodec': 'none', }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8', - 'manifest_url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': '155', 'protocol': 'm3u8', @@ -366,7 +346,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 180, }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8', - 'manifest_url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': '502', 'protocol': 'm3u8', @@ -375,7 +355,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 270, }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8', - 'manifest_url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': '827', 'protocol': 'm3u8', @@ -384,7 +364,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 360, }, { 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8', - 'manifest_url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8', + 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', 'ext': 'mp4', 'format_id': '1396', 'protocol': 'm3u8', @@ -398,16 +378,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'twitch_vod', 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', [{ - 'url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', - 'ext': 'mp4', - 'format_id': 'meta', - 'format_note': 'Quality selection URL', - 'protocol': 'm3u8', - 'preference': -100, - 'resolution': 'multiple' - }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'Audio Only', 'protocol': 'm3u8', @@ -416,7 +388,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'tbr': 182.725, }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'Mobile', 'protocol': 'm3u8', @@ -427,7 +399,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 226, }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'Low', 'protocol': 'm3u8', @@ -438,7 +410,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 360, }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'Medium', 'protocol': 'm3u8', @@ -449,7 +421,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 480, }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'High', 'protocol': 'm3u8', @@ -460,7 +432,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 720, }, { 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8', - 'manifest_url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8', + 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee', 'ext': 'mp4', 'format_id': 'Source', 'protocol': 'm3u8', @@ -478,16 +450,8 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'vidio', 'https://www.vidio.com/videos/165683/playlist.m3u8', [{ - 'url': 'https://www.vidio.com/videos/165683/playlist.m3u8', - 'ext': 'mp4', - 'format_id': 'meta', - 'format_note': 'Quality selection URL', - 'protocol': 'm3u8', - 'preference': -100, - 'resolution': 'multiple' - }, { 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8', - 'manifest_url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8', + 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', 'ext': 'mp4', 'format_id': '270p 3G', 'protocol': 'm3u8', @@ -496,7 +460,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 270, }, { 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8', - 'manifest_url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8', + 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', 'ext': 'mp4', 'format_id': '360p SD', 'protocol': 'm3u8', @@ -505,7 +469,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 360, }, { 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8', - 'manifest_url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8', + 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8', 'ext': 'mp4', 'format_id': '720p HD', 'protocol': 'm3u8', diff --git a/test/test_download.py b/test/test_download.py index 0e9f293b5..209f5f6d6 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -225,7 +225,7 @@ def generator(test_case, tname): format_bytes(got_fsize))) if 'md5' in tc: md5_for_file = _file_md5(tc_filename) - self.assertEqual(md5_for_file, tc['md5']) + self.assertEqual(tc['md5'], md5_for_file) # Finally, check test cases' data again but this time against # extracted data from info JSON file written during processing info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json' diff --git a/test/test_utils.py b/test/test_utils.py index 4cd818850..f31559e71 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -44,6 +44,7 @@ from youtube_dl.utils import ( limit_length, mimetype2ext, month_by_name, + multipart_encode, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, @@ -338,6 +339,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500) self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) + self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -619,6 +621,16 @@ class TestUtil(unittest.TestCase): 'http://example.com/path', {'test': '第二行тест'})), query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) + def test_multipart_encode(self): + self.assertEqual( + multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0], + b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n') + self.assertEqual( + multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0], + b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n') + self.assertRaises( + ValueError, multipart_encode, {b'field': b'value'}, boundary='value') + def test_dict_get(self): FALSE_VALUES = { 'none': None, @@ -899,6 +911,7 @@ class TestUtil(unittest.TestCase): def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') + self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb') def test_intlist_to_bytes(self): self.assertEqual( diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py new file mode 100644 index 000000000..cb12f8384 --- /dev/null +++ b/test/test_youtube_chapters.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import expect_value +from youtube_dl.extractor import YoutubeIE + + +class TestYoutubeChapters(unittest.TestCase): + + _TEST_CASES = [ + ( + # https://www.youtube.com/watch?v=A22oy8dFjqc + # pattern: 00:00 - <title> + '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''', + 1477, + [{ + 'start_time': 36, + 'end_time': 162, + 'title': 'Bohemian Rhapsody', + }, { + 'start_time': 162, + 'end_time': 413, + 'title': 'Radio Ga Ga', + }, { + 'start_time': 413, + 'end_time': 454, + 'title': 'Ay Oh!', + }, { + 'start_time': 454, + 'end_time': 728, + 'title': 'Hammer To Fall', + }, { + 'start_time': 728, + 'end_time': 963, + 'title': 'Crazy Little Thing Called Love', + }, { + 'start_time': 963, + 'end_time': 1038, + 'title': 'We Will Rock You', + }, { + 'start_time': 1038, + 'end_time': 1272, + 'title': 'We Are The Champions', + }, { + 'start_time': 1272, + 'end_time': 1477, + 'title': 'Is This The World We Created...?', + }] + ), + ( + # https://www.youtube.com/watch?v=ekYlRhALiRQ + # pattern: <num>. <title> 0:00 + '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.', + 4009, + [{ + 'start_time': 0, + 'end_time': 707, + 'title': '1. Those Beaten Paths of Confusion', + }, { + 'start_time': 707, + 'end_time': 1590, + 'title': '2. Beyond the Shadows of Emptiness & Nothingness', + }, { + 'start_time': 1590, + 'end_time': 2157, + 'title': '3. Poison Yourself...With Thought', + }, { + 'start_time': 2157, + 'end_time': 2672, + 'title': '4. The Agents of Transformation', + }, { + 'start_time': 2672, + 'end_time': 3187, + 'title': '5. Drowning in the Pain of Consciousness', + }, { + 'start_time': 3187, + 'end_time': 4009, + 'title': '6. Deny the Disease of Life', + }] + ), + ( + # https://www.youtube.com/watch?v=WjL4pSzog9w + # pattern: 00:00 <title> + '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo', + 2705, + [{ + 'start_time': 0, + 'end_time': 428, + 'title': 'Christening Unborn Deformities', + }, { + 'start_time': 428, + 'end_time': 976, + 'title': 'Taste of Purity', + }, { + 'start_time': 976, + 'end_time': 1485, + 'title': 'Sculpting Sins of a Universal Tongue', + }, { + 'start_time': 1485, + 'end_time': 1884, + 'title': 'Birth', + }, { + 'start_time': 1884, + 'end_time': 2275, + 'title': 'Neves', + }, { + 'start_time': 2275, + 'end_time': 2705, + 'title': 'Libations in Limbo', + }] + ), + ( + # https://www.youtube.com/watch?v=o3r1sn-t3is + # pattern: <title> 00:00 <note> + 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>', + 5640, + [{ + 'start_time': 45, + 'end_time': 266, + 'title': 'I-E-A-I-A-I-O', + }, { + 'start_time': 266, + 'end_time': 331, + 'title': 'Suite-Pee (Incomplete)', + }, { + 'start_time': 331, + 'end_time': 522, + 'title': 'Attack (First live performance since 2011)', + }, { + 'start_time': 522, + 'end_time': 752, + 'title': 'Prison Song', + }, { + 'start_time': 752, + 'end_time': 932, + 'title': 'Know (First live performance since 2011)', + }, { + 'start_time': 932, + 'end_time': 1153, + 'title': 'Aerials', + }, { + 'start_time': 1153, + 'end_time': 1209, + 'title': 'Soldier Side - Intro', + }, { + 'start_time': 1209, + 'end_time': 1472, + 'title': 'B.Y.O.B.', + }, { + 'start_time': 1472, + 'end_time': 1668, + 'title': 'Soil', + }, { + 'start_time': 1668, + 'end_time': 1838, + 'title': 'Darts', + }, { + 'start_time': 1838, + 'end_time': 2105, + 'title': 'Radio/Video', + }, { + 'start_time': 2105, + 'end_time': 2288, + 'title': 'Hypnotize', + }, { + 'start_time': 2288, + 'end_time': 2460, + 'title': 'Temper (First live performance since 1999)', + }, { + 'start_time': 2460, + 'end_time': 2577, + 'title': 'CUBErt', + }, { + 'start_time': 2577, + 'end_time': 2787, + 'title': 'Needles', + }, { + 'start_time': 2787, + 'end_time': 2978, + 'title': 'Deer Dance', + }, { + 'start_time': 2978, + 'end_time': 3085, + 'title': 'Bounce', + }, { + 'start_time': 3085, + 'end_time': 3232, + 'title': 'Suggestions', + }, { + 'start_time': 3232, + 'end_time': 3493, + 'title': 'Psycho', + }, { + 'start_time': 3493, + 'end_time': 3675, + 'title': 'Chop Suey!', + }, { + 'start_time': 3675, + 'end_time': 3854, + 'title': 'Lonely Day', + }, { + 'start_time': 3854, + 'end_time': 4090, + 'title': 'Question!', + }, { + 'start_time': 4090, + 'end_time': 4420, + 'title': 'Lost in Hollywood', + }, { + 'start_time': 4420, + 'end_time': 4577, + 'title': 'Vicinity of Obscenity (First live performance since 2012)', + }, { + 'start_time': 4577, + 'end_time': 4802, + 'title': 'Forest', + }, { + 'start_time': 4802, + 'end_time': 5037, + 'title': 'Cigaro', + }, { + 'start_time': 5037, + 'end_time': 5273, + 'title': 'Toxicity (with Chino Moreno)', + }, { + 'start_time': 5273, + 'end_time': 5640, + 'title': 'Sugar', + }] + ), + ( + # https://www.youtube.com/watch?v=PkYLQbsqCE8 + # pattern: <num> - <title> [<latinized title>] 0:00:00 + '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''', + 1138, + [{ + 'start_time': 0, + 'end_time': 490, + 'title': '1 - Во прах [Vo prakh]', + }, { + 'start_time': 490, + 'end_time': 870, + 'title': '2 - Искупление [Iskupleniye]', + }, { + 'start_time': 870, + 'end_time': 1138, + 'title': '3 - Из серпов луны...[Iz serpov luny]', + }] + ), + ] + + def test_youtube_chapters(self): + for description, duration, expected_chapters in self._TEST_CASES: + ie = YoutubeIE() + expect_value( + self, ie._extract_chapters(description, duration), + expected_chapters, None) + + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c425..4c33d494a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -370,10 +370,10 @@ class YoutubeDL(object): else: raise - if (sys.version_info >= (3,) and sys.platform != 'win32' and + if (sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and not params.get('restrictfilenames', False)): - # On Python 3, the Unicode filesystem API will throw errors (#1474) + # Unicode filesystem API will throw errors (#1474, #13027) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' 'cannot encode all characters. ' diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index e13cf547d..db018fa89 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -29,7 +29,17 @@ class ExternalFD(FileDownloader): self.report_destination(filename) tmpfilename = self.temp_name(filename) - retval = self._call_downloader(tmpfilename, info_dict) + try: + retval = self._call_downloader(tmpfilename, info_dict) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + # Live stream downloading cancellation should be considered as + # correct and expected termination thus all postprocessing + # should take place + retval = 0 + self.to_screen('[%s] Interrupted by user' % self.get_basename()) + if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) @@ -202,6 +212,11 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + seekable = info_dict.get('_seekable') if seekable is not None: # setting -seekable prevents ffmpeg from guessing if the server diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bb9e82578..bccc8ecc1 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -49,7 +49,7 @@ class FragmentFD(FileDownloader): index: 0-based index of current fragment among all fragments fragment_count: Total count of fragments - + This feature is experimental and file format may change in future. """ @@ -155,8 +155,6 @@ class FragmentFD(FileDownloader): self._write_ytdl_file(ctx) if ctx['fragment_index'] > 0: assert resume_len > 0 - else: - assert resume_len == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 100cf997f..d57ad85c2 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -36,6 +36,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, 'Charter_Direct': { 'name': 'Charter Spectrum', 'username_field': 'IDToken1', @@ -1308,6 +1313,12 @@ class AdobePassIE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}) + headers.update(self.geo_verification_headers()) + kwargs['headers'] = headers + return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) + @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): channel = etree.Element('channel') diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 989505c82..acc4ce38d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,91 +5,52 @@ import re from .turner import TurnerBaseIE from ..utils import ( - ExtractorError, int_or_none, + strip_or_none, ) class AdultSwimIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' _TESTS = [{ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', - 'playlist': [ - { - 'md5': '247572debc75c7652f253c8daa51a14d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 1', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - { - 'md5': '77b0e037a4b20ec6b98671c4c379f48d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 4', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - ], 'info_dict': { 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', 'title': 'Rick and Morty - Pilot', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1493267400, + 'upload_date': '20170427', }, - 'skip': 'This video is only available for registered users', - }, { - 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', - 'playlist': [ - { - 'md5': '2eb5c06d0f9a1539da3718d897f13ec5', - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog-0', - 'ext': 'flv', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' - }, - } - ], - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' + 'params': { + # m3u8 download + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', - 'playlist': [ - { - 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', - 'info_dict': { - 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'mp4', - 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', - }, - } - ], 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { - # heroMetadata.trailer 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', 'info_dict': { 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', 'ext': 'mp4', 'title': 'Decker - Inside Decker: A New Hero', - 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', - 'duration': 249.008, + 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', }, 'params': { # m3u8 download @@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'b7A69dzfRzuaXIECdxW8XQ', + 'title': 'Attack on Titan', + 'description': 'md5:6c8e003ea0777b47013e894767f5e114', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', }, - 'playlist': [{ - 'md5': '', - 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'ext': 'mp4', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', - }, - }], 'params': { # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'], }] - @staticmethod - def find_video_info(collection, slug): - for video in collection.get('videos'): - if video.get('slug') == slug: - return video - - @staticmethod - def find_collection_by_linkURL(collections, linkURL): - for collection in collections: - if collection.get('linkURL') == linkURL: - return collection - - @staticmethod - def find_collection_containing_video(collections, slug): - for collection in collections: - for video in collection.get('videos'): - if video.get('slug') == slug: - return collection, video - return None, None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_path = mobj.group('show_path') - episode_path = mobj.group('episode_path') - is_playlist = True if mobj.group('is_playlist') else False - - webpage = self._download_webpage(url, episode_path) - - # Extract the value of `bootstrappedData` from the Javascript in the page. - bootstrapped_data = self._parse_json(self._search_regex( - r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) - - # Downloading videos from a /videos/playlist/ URL needs to be handled differently. - # NOTE: We are only downloading one video (the current one) not the playlist - if is_playlist: - collections = bootstrapped_data['playlists']['collections'] - collection = self.find_collection_by_linkURL(collections, show_path) - video_info = self.find_video_info(collection, episode_path) - - show_title = video_info['showTitle'] - segment_ids = [video_info['videoPlaybackID']] + show_path, episode_path = re.match(self._VALID_URL, url).groups() + display_id = episode_path or show_path + webpage = self._download_webpage(url, display_id) + initial_data = self._parse_json(self._search_regex( + r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', + webpage, 'initial data'), display_id) + + is_stream = show_path == 'streams' + if is_stream: + if not episode_path: + episode_path = 'live-stream' + + video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) + video_id = video_data.get('stream') + + if not video_id: + entries = [] + for episode in video_data.get('archiveEpisodes', []): + episode_url = episode.get('url') + if not episode_url: + continue + entries.append(self.url_result( + episode_url, 'AdultSwim', episode.get('id'))) + return self.playlist_result( + entries, video_data.get('id'), video_data.get('title'), + strip_or_none(video_data.get('description'))) else: - collections = bootstrapped_data['show']['collections'] - collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in the collections, let's try `slugged_video`. - if video_info is None: - if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: - video_info = bootstrapped_data['slugged_video'] - if not video_info: - video_info = bootstrapped_data.get( - 'heroMetadata', {}).get('trailer', {}).get('video') - if not video_info: - video_info = bootstrapped_data.get('onlineOriginals', [None])[0] - if not video_info: - raise ExtractorError('Unable to find video info') - - show = bootstrapped_data['show'] - show_title = show['title'] - stream = video_info.get('stream') - if stream and stream.get('videoPlaybackID'): - segment_ids = [stream['videoPlaybackID']] - elif video_info.get('clips'): - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] - elif video_info.get('videoPlaybackID'): - segment_ids = [video_info['videoPlaybackID']] - elif video_info.get('id'): - segment_ids = [video_info['id']] - else: - if video_info.get('auth') is True: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.', expected=True) - else: - raise ExtractorError('Unable to find stream or clips') - - episode_id = video_info['id'] - episode_title = video_info['title'] - episode_description = video_info.get('description') - episode_duration = int_or_none(video_info.get('duration')) - view_count = int_or_none(video_info.get('views')) + show_data = initial_data['show'] + + if not episode_path: + entries = [] + for video in show_data.get('videos', []): + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('id'))) + return self.playlist_result( + entries, show_data.get('id'), show_data.get('title'), + strip_or_none(show_data.get('metadata', {}).get('description'))) + + video_data = show_data['sluggedVideo'] + video_id = video_data['id'] + + info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, + video_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': video_data.get('auth'), + }) - entries = [] - for part_num, segment_id in enumerate(segment_ids): - segement_info = self._extract_cvp_info( - 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, - segment_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', - 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', - }, - }) - segment_title = '%s - %s' % (show_title, episode_title) - if len(segment_ids) > 1: - segment_title += ' Part %d' % (part_num + 1) - segement_info.update({ - 'id': segment_id, - 'title': segment_title, - 'description': episode_description, + info.update({ + 'id': video_id, + 'display_id': display_id, + 'description': info.get('description') or strip_or_none(video_data.get('description')), + }) + if not is_stream: + info.update({ + 'duration': info.get('duration') or int_or_none(video_data.get('duration')), + 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), + 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), + 'episode': info['title'], + 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), }) - entries.append(segement_info) - return { - '_type': 'playlist', - 'id': episode_id, - 'display_id': episode_path, - 'entries': entries, - 'title': '%s - %s' % (show_title, episode_title), - 'description': episode_description, - 'duration': episode_duration, - 'view_count': view_count, - } + info['series'] = video_data.get('collection_title') or info.get('series') + if info['series'] and info['series'] != info['title']: + info['title'] = '%s - %s' % (info['series'], info['title']) + + return info diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index c01c67303..2dcdba9d2 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE): for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): entries.append(self.url_result( compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - elif url_parts_len == 2: + if entries: + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + else: + # single season + url_parts_len = 2 + if url_parts_len == 2: entries = [] for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): episode_attributes = extract_attributes(episode_item) @@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE): url, episode_attributes['data-canonical']) entries.append(self.url_result( episode_url, 'AENetworks', - episode_attributes['data-videoid'])) + episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) return self.playlist_result( entries, self._html_search_meta('aetn:SeasonId', webpage)) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 78d29c861..c8cb91dcb 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor): file_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', note='Downloading part %d m3u8 information' % file_num) - title = title if one else '%s (part %d)' % (title, file_num) file_info = common_entry.copy() file_info.update({ 'id': format_id, - 'title': title, + 'title': title if one else '%s (part %d)' % (title, file_num), 'upload_date': upload_date, 'duration': file_duration, 'formats': formats, diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 388e578d5..c68be3134 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,9 +4,9 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', 'info_dict': { 'id': '3792260579001', @@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor): }, 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', - } + }, { + 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 98f8e69cd..fde1a8ff7 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -34,9 +34,12 @@ class AMPIE(InfoExtractor): if isinstance(media_thumbnail, dict): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: - thumbnail = thumbnail_data['@attributes'] + thumbnail = thumbnail_data.get('@attributes', {}) + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue thumbnails.append({ - 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'url': self._proto_relative_url(thumbnail_url, 'http:'), 'width': int_or_none(thumbnail.get('width')), 'height': int_or_none(thumbnail.get('height')), }) @@ -47,9 +50,14 @@ class AMPIE(InfoExtractor): if isinstance(media_subtitle, dict): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: - subtitle = subtitle_data['@attributes'] - lang = subtitle.get('lang') or 'en' - subtitles[lang] = [{'url': subtitle['href']}] + subtitle = subtitle_data.get('@attributes', {}) + subtitle_href = subtitle.get('href') + if not subtitle_href: + continue + subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ + 'url': subtitle_href, + 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href), + }) formats = [] media_content = get_media_node('content') diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 623f44dce..8023da702 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -5,6 +5,7 @@ import base64 import hashlib import json import random +import re import time from .common import InfoExtractor @@ -16,6 +17,7 @@ from ..utils import ( intlist_to_bytes, int_or_none, strip_jsonp, + unescapeHTML, ) @@ -26,6 +28,8 @@ def md5_text(s): class AnvatoIE(InfoExtractor): + _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor): 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' } + _MCP_TO_ACCESS_KEY_TABLE = { + 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', + 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', + 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', + 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', + 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', + 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', + 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', + 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' + } + + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' def __init__(self, *args, **kwargs): @@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor): } if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): - # Not using _extract_m3u8_formats here as individual media - # playlists are also included in published_urls. - if tbr is None: - formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) - continue - else: + if tbr is not None: a_format.update({ 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), 'ext': 'mp4', @@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } + @staticmethod + def _extract_urls(ie, webpage, video_id): + entries = [] + for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) + return entries + def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json(self._html_search_regex( - r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, - 'Anvato player data'), video_id) + anvplayer_data = self._parse_json( + self._html_search_regex( + self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), + video_id) return self._get_anvato_videos( anvplayer_data['accessKey'], anvplayer_data['video']) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + access_key, video_id = mobj.group('access_key_or_mcp', 'id') + if access_key not in self._ANVACK_TABLE: + access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] + return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index ea7a70393..a84b8b1eb 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor): _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' _TEST = { 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'md5': 'e7c38568a01ea45402570e6029206723', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', 'title': 'Energy', 'uploader': 'Drake', - 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150710', 'timestamp': 1436545535, }, diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a6801f3d4..b45b431e1 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor): }, { 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', 'info_dict': { - 'id': 'blackthorn', + 'id': '4489', + 'title': 'Blackthorn', }, 'playlist_mincount': 2, 'expected_warnings': ['Unable to download JSON metadata'], @@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor): 'title': 'Most Popular', 'id': 'mostpopular', }, - 'playlist_mincount': 80, + 'playlist_mincount': 30, }, { 'url': 'http://trailers.apple.com/#section=moviestudios', 'info_dict': { diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index e21045bed..3c7d7250b 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor): } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', + 'md5': '0869000b4ce265e8ca62738b336b268a', 'info_dict': { 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:b4544662605877edd99df22f9620d858', + 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 99af6dc5a..01fa308ff 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor): }, { 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', - 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'md5': '6e52cbb513c405e403dbacb7aacf8747', 'info_dict': { 'id': 'capitulo-112-david-bustamante', 'ext': 'flv', diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 8fc5f65c6..e48bb8972 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor): 'title': '3/09/2016 Czaban Hour 3', 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', 'duration': 2245.72, - 'uploader': 'Steve Czaban', + 'uploader': 'SB Nation A.M.', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', } }, { diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 056e06376..489d0ba53 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor): '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '73d0b3171568232574e45652f8720b5c', + 'md5': '0369ace6b939f0927e62c67a1a8d9fa7', 'info_dict': { 'id': '2650410135', - 'ext': 'mp3', - 'title': 'Lanius (Battle)', - 'uploader': 'Ben Prunty Music', + 'ext': 'aiff', + 'title': 'Ben Prunty - Lanius (Battle)', + 'uploader': 'Ben Prunty', }, }] @@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) + thumbnail = self._html_search_meta('og:image', webpage, default=None) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) @@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor): return { 'id': track_id, 'title': data['title'], + 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), } @@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': info.get('thumb_url'), + 'thumbnail': info.get('thumb_url') or thumbnail, 'uploader': info.get('artist'), 'artist': artist, 'track': track, diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b0b7914d8..d5c5822f2 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -16,7 +16,7 @@ class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '46c384def73b33dbc581262e5ee67cef', + 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', 'info_dict': { 'id': '5416503', 'ext': 'mp4', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80dd8382e..1e3f25515 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor): 'preference': -2 if 'hd.mp4' in backup_url else -3, }) + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + self._sort_formats(formats) entries.append({ diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 7a8e1f60b..e829974ff 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor): 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', - 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', 'uploader_id': 6466954, 'upload_date': '20151011', }, @@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': '8c2c12e3af7805152675446c905d159b', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index ff0aa11b1..2c32b6ae2 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -77,7 +77,7 @@ class BRIE(InfoExtractor): 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', - 'upload_date': '20140117', + 'upload_date': '20170208', } }, ] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 97602ca30..0ed59bcbc 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,6 +5,7 @@ import re import json from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, @@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor): return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ @@ -522,7 +523,7 @@ class BrightcoveNewIE(InfoExtractor): # [2] looks like: for video, script_tag, account_id, player_id, embed in re.findall( r'''(?isx) - (<video\s+[^>]*data-video-id=['"]?[^>]+>) + (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) (?:.*? (<script[^>]+ src=["\'](?:https?:)?//players\.brightcove\.net/ @@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(message, expected=True) raise + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + title = json_data['name'].strip() formats = [] @@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor): }) formats.append(f) - errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor): is_live = False duration = float_or_none(json_data.get('duration'), 1000) - if duration and duration < 0: + if duration is not None and duration <= 0: is_live = True return { diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index f1f128c45..acd87e371 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Terrasses du Numérique', 'duration': 122, }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } }, { 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', 'only_matching': True, diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index cf678e7f8..87ad14e91 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -96,6 +96,7 @@ class CBCIE(InfoExtractor): 'info_dict': { 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, 'playlist_mincount': 6, }] @@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }, { - # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url 'url': 'http://www.cbc.ca/player/play/2164402062', - 'md5': '17a61eb813539abea40618d6323a7f82', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cancer survivor four times over', 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 8d5f11dd1..7d78e3aae 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE): 'title': 'A Very Blue Anniversary', 'description': 'CBS2’s Cindy Hsu has more.', 'thumbnail': 're:^https?://.*', - 'timestamp': 1479962220, - 'upload_date': '20161124', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', 'uploader': 'CBS', 'subtitles': { 'en': 'mincount:5', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1ee35b501..78b7a923c 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -9,7 +9,10 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + multipart_encode, parse_duration, + random_birthday, + urljoin, ) @@ -27,7 +30,8 @@ class CDAIE(InfoExtractor): 'description': 'md5:269ccd135d550da90d1662651fcb9772', 'thumbnail': r're:^https?://.*\.jpg$', 'average_rating': float, - 'duration': 39 + 'duration': 39, + 'age_limit': 0, } }, { 'url': 'http://www.cda.pl/video/57413289', @@ -41,13 +45,41 @@ class CDAIE(InfoExtractor): 'uploader': 'crash404', 'view_count': int, 'average_rating': float, - 'duration': 137 + 'duration': 137, + 'age_limit': 0, } + }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('cda.pl', 'cda.player', 'html5') @@ -57,6 +89,13 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + need_confirm_age = False + if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + formats = [] uploader = self._search_regex(r'''(?x) @@ -81,6 +120,7 @@ class CDAIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, } def extract_format(page, version): @@ -121,7 +161,12 @@ class CDAIE(InfoExtractor): for href, resolution in re.findall( r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', webpage): - webpage = self._download_webpage( + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( self._BASE_URL + href, video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: @@ -129,6 +174,7 @@ class CDAIE(InfoExtractor): # invalid version is requested. self.report_warning('Unable to download %s version information' % resolution) continue + extract_format(webpage, resolution) self._sort_formats(formats) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index bb52e0c6f..0920f6219 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': '720563e467b86374c194bdead08d207d', + 'md5': 'b9a5dc46294154c1193e2d10e0c95693', 'info_dict': { 'id': '4343170', 'ext': 'mp4', diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index 18c734766..6a41db87c 100644 --- a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 7713.088, 'timestamp': 1413309600, 'upload_date': '20141014', @@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'wmv', 'title': '64ste Vakantiecursus: Afvalwater', 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 10853, 'timestamp': 1326446400, 'upload_date': '20120113', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8b3f04c61..fec39da8b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -245,6 +245,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -990,6 +994,7 @@ class InfoExtractor(object): 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), + 'view_count': int_or_none(e.get('interactionCount')), }) for e in json_ld: @@ -1334,7 +1339,7 @@ class InfoExtractor(object): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + formats = [] format_url = lambda u: ( u @@ -1386,6 +1391,7 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': format_url(media_url), + 'manifest_url': m3u8_url, 'language': media.get('LANGUAGE'), 'ext': ext, 'protocol': entry_protocol, @@ -1438,7 +1444,7 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': manifest_url, - 'manifest_url': manifest_url, + 'manifest_url': m3u8_url, 'tbr': tbr, 'ext': ext, 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), @@ -1995,6 +2001,12 @@ class InfoExtractor(object): compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. [MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] @@ -2016,8 +2028,11 @@ class InfoExtractor(object): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 - width = int_or_none(track.get('MaxWidth')) - height = int_or_none(track.get('MaxHeight')) + # [1] does not mention Width and Height attributes. However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2168,7 +2183,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') if hds_host: f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2190,8 +2205,9 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') - http_base_url = 'http' + url_base + url_base = self._search_regex( + r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') + http_base_url = '%s:%s' % ('http', url_base) formats = [] if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( @@ -2225,7 +2241,7 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': protocol + url_base, + 'url': '%s:%s' % (protocol, url_base), 'format_id': protocol, 'protocol': protocol, }) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index d3463b874..0c3f0c0e4 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,7 +16,6 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, - remove_end, ) @@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| + (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) + )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor): 'upload_date': '20150916', 'timestamp': 1442434955, } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, }] def _extract_series(self, url, webpage): @@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage, url_type): + def _extract_video_params(self, webpage): query = {} params = self._search_regex( r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) @@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor): 'playerId': params['data-player'], 'target': params['id'], }) - video_id = query['videoId'] + return query + + def _extract_video(self, params): + video_id = params['videoId'] + video_info = None - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=query) - if info_page: - video_info = info_page.get('video') - if not video_info: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + else: info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=query) + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: video_info = self._parse_json( self._search_regex( r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), @@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld( - webpage, video_id, fatal=False) if url_type != 'embed' else {} - info.update({ + return { 'id': video_id, 'formats': formats, 'title': title, @@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor): 'series': video_info.get('series_title'), 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), - }) - return info + 'categories': video_info.get('categories'), + } def _real_extract(self, url): - site, url_type, item_id = re.match(self._VALID_URL, url).groups() + video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() - # Convert JS embed to regular embed - if url_type == 'embedjs': - parsed_url = compat_urlparse.urlparse(url) - url = compat_urlparse.urlunparse(parsed_url._replace( - path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) - url_type = 'embed' + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) - webpage = self._download_webpage(url, item_id) + webpage = self._download_webpage(url, display_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage, url_type) + params = self._extract_video_params(webpage) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py index 5fa1f006b..6ea03e65c 100644 --- a/youtube_dl/extractor/coub.py +++ b/youtube_dl/extractor/coub.py @@ -24,12 +24,11 @@ class CoubIE(InfoExtractor): 'duration': 4.6, 'timestamp': 1428527772, 'upload_date': '20150408', - 'uploader': 'Артём Лоскутников', + 'uploader': 'Artyom Loskutnikov', 'uploader_id': 'artyom.loskutnikov', 'view_count': int, 'like_count': int, 'repost_count': int, - 'comment_count': int, 'age_limit': 0, }, }, { @@ -118,7 +117,6 @@ class CoubIE(InfoExtractor): view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) like_count = int_or_none(coub.get('likes_count')) repost_count = int_or_none(coub.get('recoubs_count')) - comment_count = int_or_none(coub.get('comments_count')) age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) if age_restricted is not None: @@ -137,7 +135,6 @@ class CoubIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, 'repost_count': repost_count, - 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f919ed208..13f425b2b 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor): 'season_number': 8, 'episode_number': 4, 'subtitles': { - 'en-US': [{ - 'ext': 'ttml', - }] + 'en-US': [ + {'ext': 'vtt'}, + {'ext': 'tt'}, + ] }, }, 'params': { diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 2ed8b30bb..2ffa4a7f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', @@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'series': "KONOSUBA -God's blessing on this wonderful world!", 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, - 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode': 'Give Me Deliverance From This Judicial Injustice!', 'episode_number': 1, }, 'params': { diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d4576160b..171820e27 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,7 @@ from ..utils import ( smuggle_url, determine_ext, ExtractorError, + extract_attributes, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor): 'uploader_id': '12987475', }, }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' def _real_extract(self, url): video_id = self._match_id(url) @@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor): if ustream_url: return self.url_result(ustream_url, UstreamIE.ie_key()) + if '&vod' not in url: + bc = self._search_regex( + r"(<[^>]+id='brightcove-player-embed'[^>]+>)", + webpage, 'brightcove embed', default=None) + if bc: + bc_attr = extract_attributes(bc) + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + bc_attr.get('data-bcaccountid', '3162030207001'), + bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'), + bc_attr.get('data-newbcplayerid', 'default'), + bc_attr['data-bcid']) + return self.url_result(smuggle_url(bc_url, {'source_url': url})) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 98c835bf1..538565c66 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, determine_protocol, + try_get, unescapeHTML, ) @@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) title = unescapeHTML(video_data['title']) - video_sources = self._download_json(video_data.get( - 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + sources_url = (try_get( + video_data, + (lambda x: x['plugins']['sources']['url'], + lambda x: x['sources']['url']), compat_str) or + 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + + video_sources = self._download_json(sources_url, video_id) formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 246efde43..f8db76c18 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor): ('stream_h264_hd1080_url', 'hd180'), ] - _TESTS = [ - { - 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', - 'md5': '2137c41a8e78554bb09225b8eb322406', - 'info_dict': { - 'id': 'x2iuewm', - 'ext': 'mp4', - 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 74, - 'timestamp': 1425657362, - 'upload_date': '20150306', - 'uploader': 'IGN', - 'uploader_id': 'xijv66', - 'age_limit': 0, - 'view_count': int, - } + _TESTS = [{ + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + 'view_count': int, + }, + }, { + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', + 'info_dict': { + 'id': 'x2iuewm', + 'ext': 'mp4', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, }, + 'skip': 'video gone', + }, { # Vevo video - { - 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', - 'info_dict': { - 'title': 'Roar (Official)', - 'id': 'USUV71301934', - 'ext': 'mp4', - 'uploader': 'Katy Perry', - 'upload_date': '20130905', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'VEVO is only available in some countries', + 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + 'info_dict': { + 'title': 'Roar (Official)', + 'id': 'USUV71301934', + 'ext': 'mp4', + 'uploader': 'Katy Perry', + 'upload_date': '20130905', + }, + 'params': { + 'skip_download': True, }, + 'skip': 'VEVO is only available in some countries', + }, { # age-restricted video - { - 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', - 'md5': '0d667a7b9cebecc3c89ee93099c4159d', - 'info_dict': { - 'id': 'xyh2zz', - 'ext': 'mp4', - 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', - 'uploader': 'HotWaves1012', - 'age_limit': 18, - }, - 'skip': 'video gone', + 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + 'md5': '0d667a7b9cebecc3c89ee93099c4159d', + 'info_dict': { + 'id': 'xyh2zz', + 'ext': 'mp4', + 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + 'uploader': 'HotWaves1012', + 'age_limit': 18, }, + 'skip': 'video gone', + }, { # geo-restricted, player v5 - { - 'url': 'http://www.dailymotion.com/video/xhza0o', - 'only_matching': True, - }, + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, + }, { # with subtitles - { - 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', - 'only_matching': True, - }, - { - 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', - 'only_matching': True, - } - ] + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index bdfe638b4..5c9c0ecdc 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor): 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'Daily Show', + 'title': 'Daily Show for July 03, 2015', + 'description': 'md5:80eb927244d6749900de6072c7cc2c86', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index 1f75352ca..148605c0b 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor): 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', 'duration': 290, 'timestamp': 1476767794.2809999, - 'upload_date': '20160525', + 'upload_date': '20161018', 'uploader': 'parthivi001', 'uploader_id': 'user52596202', 'view_count': int, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 82d8a042f..9757f4422 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals import time import hashlib +import re from .common import InfoExtractor from ..utils import ( ExtractorError, unescapeHTML, + unified_strdate, + urljoin, ) @@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': 'iseven', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': '17732', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor): 'uploader': uploader, 'is_live': True, } + + +class DouyuShowIE(InfoExtractor): + _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', + 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'info_dict': { + 'id': 'rjNBdvnVXNzvE2yw', + 'ext': 'mp4', + 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', + 'duration': 7150.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '陈一发儿', + 'uploader_id': 'XrZwYelr5wbK', + 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', + 'upload_date': '20170402', + }, + }, { + 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', + 'only_matching': True, + }] + + def _real_extract(self, url): + url = url.replace('vmobile.', 'v.') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + room_info = self._parse_json(self._search_regex( + r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + + video_info = None + + for trial in range(5): + # Sometimes Douyu rejects our request. Let's try it more times + try: + video_info = self._download_json( + 'https://vmobile.douyu.com/video/getInfo', video_id, + query={'vid': video_id}, + headers={ + 'Referer': url, + 'x-requested-with': 'XMLHttpRequest', + }) + break + except ExtractorError: + self._sleep(1, video_id) + + if not video_info: + raise ExtractorError('Can\'t fetch video info') + + formats = self._extract_m3u8_formats( + video_info['data']['video_url'], video_id, + entry_protocol='m3u8_native', ext='mp4') + + upload_date = unified_strdate(self._html_search_regex( + r'<em>上传时间:</em><span>([^<]+)</span>', webpage, + 'upload date', fatal=False)) + + uploader = uploader_id = uploader_url = None + mobj = re.search( + r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', + webpage) + if mobj: + uploader_id, uploader = mobj.groups() + uploader_url = urljoin(url, '/author/' + uploader_id) + + return { + 'id': video_id, + 'title': room_info['name'], + 'formats': formats, + 'duration': room_info.get('duration'), + 'thumbnail': room_info.get('pic'), + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e4917014a..c84624f1e 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '25e659cccc9a2ed956110a299fdf5983', + 'md5': '7ae17b4e18eb5d29212f424a7511c184', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', @@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor): 'upload_date': '20160823', 'duration': 606.84, }, - 'params': { - 'skip_download': True, - }, }, { + # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'md5': '2c37175c718155930f939ef59952474a', 'info_dict': { 'id': 'christiania-pusher-street-ryddes-drdkrjpo', 'ext': 'mp4', 'title': 'LIVE Christianias rydning af Pusher Street er i gang', - 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', 'duration': 131.4, }, + 'params': { + 'skip_download': True, + }, + }, { + # with SignLanguage formats + 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', + 'info_dict': { + 'id': 'historien-om-danmark-stenalder', + 'ext': 'mp4', + 'title': 'Historien om Danmark: Stenalder (1)', + 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', + 'timestamp': 1490401996, + 'upload_date': '20170325', + 'duration': 3502.04, + 'formats': 'mincount:20', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor): elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') - spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') if not uri: @@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor): target = link.get('Target') format_id = target or '' preference = None - if spoken_subtitles: + if asset_target in ('SpokenSubtitles', 'SignLanguage'): preference = -1 - format_id += '-spoken-subtitles' + format_id += '-%s' % asset_target if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 355a4e56f..ed603eb29 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE +from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE @@ -250,7 +251,10 @@ from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE -from .douyutv import DouyuTVIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) from .dplay import ( DPlayIE, DPlayItIE, @@ -349,9 +353,9 @@ from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( - PluzzIE, - FranceTvInfoIE, FranceTVIE, + FranceTVEmbedIE, + FranceTVInfoIE, GenerationQuoiIE, CultureboxIE, ) @@ -541,6 +545,7 @@ from .mangomolo import ( ) from .matchtv import MatchTVIE from .mdr import MDRIE +from .mediaset import MediasetIE from .medici import MediciIE from .meipai import MeipaiIE from .melonvod import MelonVODIE @@ -662,6 +667,8 @@ from .nintendo import NintendoIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE +from .nonktube import NonkTubeIE +from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE @@ -730,8 +737,8 @@ from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, - ORFOE1IE, ORFFM4IE, + ORFOE1IE, ORFIPTVIE, ) from .packtpub import ( @@ -1096,6 +1103,10 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) +from .upskill import ( + UpskillIE, + UpskillCourseIE, +) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE @@ -1123,6 +1134,7 @@ from .vgtv import ( from .vh1 import VH1IE from .vice import ( ViceIE, + ViceArticleIE, ViceShowIE, ) from .viceland import VicelandIE @@ -1298,5 +1310,6 @@ from .youtube import ( YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE +from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index a3bb98377..985542727 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TEST = { - 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'i0qKWsk3qJaM', + 'id': 'bwduI3X_TgUB', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', @@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config = self._parse_json( - self._search_regex( - r"data-player-config='([^']+)'", webpage, 'data player config'), + self._html_search_regex( + r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", + webpage, 'data player config'), video_id) return self.url_result(smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 48d43ae58..546d5caa0 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -21,11 +21,13 @@ from .dailymotion import ( class FranceTVBaseInfoExtractor(InfoExtractor): - def _extract_video(self, video_id, catalogue): + def _extract_video(self, video_id, catalogue=None): info = self._download_json( - 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' - % (video_id, catalogue), - video_id, 'Downloading video JSON') + 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', + video_id, 'Downloading video JSON', query={ + 'idDiffusion': video_id, + 'catalogue': catalogue or '', + }) if info.get('status') == 'NOK': raise ExtractorError( @@ -109,27 +111,97 @@ class FranceTVBaseInfoExtractor(InfoExtractor): } -class PluzzIE(FranceTVBaseInfoExtractor): - IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' +class FranceTVIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html' - # Can't use tests, videos expire in 7 days + _TESTS = [{ + 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', + 'info_dict': { + 'id': '157550144', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1494156300, + 'upload_date': '20170507', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + # france3 + 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', + 'only_matching': True, + }, { + # france4 + 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', + 'only_matching': True, + }, { + # france5 + 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', + 'only_matching': True, + }, { + # franceo + 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', + 'only_matching': True, + }, { + # france2 live + 'url': 'https://www.france.tv/france-2/direct.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', + 'only_matching': True, + }, { + 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_meta( - 'id_video', webpage, 'video id', default=None) + catalogue = None + video_id = self._search_regex( + r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'video id', default=None, group='id') + if not video_id: - video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + video_id, catalogue = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + webpage, 'video ID').split('@') + return self._extract_video(video_id, catalogue) - return self._extract_video(video_id, 'Pluzz') +class FranceTVEmbedIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' -class FranceTvInfoIE(FranceTVBaseInfoExtractor): + _TEST = { + 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', + 'info_dict': { + 'id': 'NI_983319', + 'ext': 'mp4', + 'title': 'Le Pen Reims', + 'upload_date': '20170505', + 'timestamp': 1493981780, + 'duration': 16, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, + video_id) + + return self._extract_video(video['video_id'], video.get('catalog')) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' @@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id, catalogue) -class FranceTVIE(FranceTVBaseInfoExtractor): - IE_NAME = 'francetv' - IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?france[2345o]\.fr/ - (?: - emissions/[^/]+/(?:videos|diffusions)| - emission/[^/]+| - videos| - jt - ) - /| - embed\.francetv\.fr/\?ue= - ) - (?P<id>[^/?]+) - ''' - - _TESTS = [ - # france2 - { - 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - 'md5': 'c03fc87cb85429ffd55df32b9fc05523', - 'info_dict': { - 'id': '109169362', - 'ext': 'flv', - 'title': '13h15, le dimanche...', - 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', - 'upload_date': '20140914', - 'timestamp': 1410693600, - }, - }, - # france3 - { - 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', - 'md5': '679bb8f8921f8623bd658fa2f8364da0', - 'info_dict': { - 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - 'ext': 'mp4', - 'title': 'Le scandale du prix des médicaments', - 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', - 'upload_date': '20131113', - 'timestamp': 1384380000, - }, - }, - # france4 - { - 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', - 'info_dict': { - 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'ext': 'mp4', - 'title': 'Hero Corp Making of - Extrait 1', - 'description': 'md5:c87d54871b1790679aec1197e73d650a', - 'upload_date': '20131106', - 'timestamp': 1383766500, - }, - }, - # france5 - { - 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', - 'md5': 'f6c577df3806e26471b3d21631241fd0', - 'info_dict': { - 'id': '123327454', - 'ext': 'flv', - 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', - 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', - 'upload_date': '20150831', - 'timestamp': 1441035120, - }, - }, - # franceo - { - 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', - 'md5': '47d5816d3b24351cdce512ad7ab31da8', - 'info_dict': { - 'id': '125377621', - 'ext': 'flv', - 'title': 'Infô soir', - 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', - 'upload_date': '20150718', - 'timestamp': 1437241200, - 'duration': 414, - }, - }, - { - # francetv embed - 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'flv', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - }, - { - 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', - 'only_matching': True, - }, - { - 'url': 'http://www.franceo.fr/videos/125377617', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) - - class GenerationQuoiIE(InfoExtractor): IE_NAME = 'france2.fr:generation-quoi' _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index e44a2a87f..8c37509ec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,15 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_HTTPError from ..utils import ( determine_ext, int_or_none, js_to_json, - sanitized_Request, ExtractorError, urlencode_postdata ) @@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'funimation' + _TOKEN = None _TESTS = [{ 'url': 'https://www.funimation.com/shows/hacksign/role-play/', @@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { - 'id': '9635', + 'id': '210051', 'display_id': 'broadcast-dub-preview', 'ext': 'mp4', 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', - 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, }] - _LOGIN_URL = 'http://www.funimation.com/login' - - def _extract_cloudflare_session_ua(self, url): - ci_session_cookie = self._get_cookies(url).get('ci_session') - if ci_session_cookie: - ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) - # ci_session is a string serialized by PHP function serialize() - # This case is simple enough to use regular expressions only - return self._search_regex( - r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', - default=None) - def _login(self): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata({ - 'email_field': username, - 'password_field': password, - }) - user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) - if not user_agent: - user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' - login_request = sanitized_Request(self._LOGIN_URL, data, headers={ - 'User-Agent': user_agent, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - login_page = self._download_webpage( - login_request, None, 'Logging in as %s' % username) - if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): - return - error = self._html_search_regex( - r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', - login_page, 'error messages', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in as %s' % username, data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + self._TOKEN = data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise ExtractorError(error, expected=True) + raise def _real_initialize(self): self._login() @@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor): description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) try: + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Token %s' % self._TOKEN sources = self._download_json( 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, - video_id)['items'] + video_id, headers=headers)['items'] except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: error = self._parse_json(e.cause.read(), video_id)['errors'][0] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 81c0ce9a3..49409369c 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor): m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) source_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] bitrates.sort() diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 682c49e79..00d311158 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -78,8 +78,7 @@ class GameSpotIE(OnceIE): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3136427db..f71d9092e 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor): 'format': 'jp', # The japanese audio } }, + { + # gdc-player.html + 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', + 'info_dict': { + 'id': '1435', + 'display_id': 'An-American-engine-in-Tokyo', + 'ext': 'flv', + 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + }, + }, ] def _login(self, webpage_url, display_id): @@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor): 'title': title, } - PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 67184bc5d..c108d4a8a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -86,6 +86,10 @@ from .openload import OpenloadIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE class GenericIE(InfoExtractor): @@ -1427,6 +1431,22 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 'skip_download': True, + }, + }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1677,6 +1697,42 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 5, }, + { + 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', + 'info_dict': { + 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', + 'title': 'Standoff with Walnut Creek murder suspect ends', + 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', + }, + 'playlist_mincount': 4, + }, + { + # WashingtonPost embed + 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', + 'info_dict': { + 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', + 'ext': 'mp4', + 'title': "No one has seen the drama series based on Trump's life \u2014 until now", + 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', + 'timestamp': 1455216756, + 'uploader': 'The Washington Post', + 'upload_date': '20160211', + }, + 'add_ie': [WashingtonPostIE.ie_key()], + }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2070,57 +2126,20 @@ class GenericIE(InfoExtractor): playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player - match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) - if match: - embed_url = self._proto_relative_url( - unescapeHTML(match.group('url'))) - return { - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'Wistia', - 'uploader': video_uploader, - } - - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: return { '_type': 'url_transparent', - 'url': 'wistia:%s' % match.group('id'), - 'ie_key': 'Wistia', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, } - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 - ''', webpage) - if match: - return self.url_result(self._proto_relative_url( - 'wistia:%s' % match.group('id')), 'Wistia') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player - matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', - webpage) - if matches: - return { - '_type': 'playlist', - 'entries': [{ - '_type': 'url', - 'ie_key': 'CondeNast', - 'url': ma, - } for ma in matches], - 'title': video_title, - 'id': video_id, - } - # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -2514,28 +2533,11 @@ class GenericIE(InfoExtractor): return self.playlist_result( limelight_urls, video_id, video_title, video_description) - mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) - if mobj: - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - return self.url_result(smuggle_url('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), - 'Limelight%s' % mobj.group(1), mobj.group(2)) - - mobj = re.search( - r'''(?sx) - <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? - <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) - ''', webpage) - if mobj: - return self.url_result(smuggle_url( - 'limelight:media:%s' % mobj.group('id'), - {'source_url': url}), 'LimelightMedia', mobj.group('id')) + # Look for Anvato embeds + anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) + if anvato_urls: + return self.playlist_result( + anvato_urls, video_id, video_title, video_description) # Look for AdobeTVVideo embeds mobj = re.search( @@ -2654,6 +2656,18 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, ie=RutubeIE.ie_key()) + # Look for WashingtonPost embeds + wapo_urls = WashingtonPostIE._extract_urls(webpage) + if wapo_urls: + return self.playlist_from_matches( + wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) + + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 4c9be47b4..9c7b1bd37 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,22 +36,26 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _TESTS = [{ - 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', + 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { - 'id': '0_g86w5onx', + 'id': 'VDKA3807643', 'ext': 'mp4', - 'title': 'Sneak Peek: Language Arts', - 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + 'title': 'The Traitor in the White House', + 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', }, 'params': { # m3u8 download 'skip_download': True, }, }, { - 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', - 'only_matching': True, + 'url': 'http://watchdisneyxd.go.com/doraemon', + 'info_dict': { + 'title': 'Doraemon', + 'id': 'SH55574025', + }, + 'playlist_mincount': 51, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -60,19 +64,36 @@ class GoIE(AdobePassIE): 'only_matching': True, }] + def _extract_videos(self, brand, video_id='-1', show_id='-1'): + display_id = video_id if video_id != '-1' else show_id + return self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), + display_id)['video'] + def _real_extract(self, url): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + site_info = self._SITE_INFO[sub_domain] + brand = site_info['brand'] if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') - site_info = self._SITE_INFO[sub_domain] - brand = site_info['brand'] - video_data = self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), - video_id)['video'][0] + r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) + if not video_id: + # show extraction works for Disney, DisneyJunior and DisneyXD + # ABC and Freeform has different layout + show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') + videos = self._extract_videos(brand, show_id=show_id) + show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) + entries = [] + for video in videos: + entries.append(self.url_result( + video['url'], 'Go', video.get('id'), video.get('title'))) + entries.reverse() + return self.playlist_result(entries, show_id, show_title) + video_data = self._extract_videos(brand, video_id)[0] + video_id = video_data['id'] title = video_data['title'] formats = [] @@ -105,7 +126,7 @@ class GoIE(AdobePassIE): self._initialize_geo_bypass(['US']) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', - video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers()) + video_id, data=urlencode_postdata(data)) errors = entitlement.get('errors', {}).get('errors', []) if errors: for error in errors: diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index e21ebb8fb..1d905dc81 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -16,8 +16,8 @@ from ..utils import ( class HitboxIE(InfoExtractor): IE_NAME = 'hitbox' - _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/video/203213', 'info_dict': { 'id': '203213', @@ -38,13 +38,15 @@ class HitboxIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/hitboxlive/videos/203213', + 'only_matching': True, + }] def _extract_metadata(self, url, video_id): thumb_base = 'https://edge.sf.hitbox.tv' metadata = self._download_json( - '%s/%s' % (url, video_id), video_id, - 'Downloading metadata JSON') + '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON') date = 'media_live_since' media_type = 'livestream' @@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor): views = int_or_none(video_meta.get('media_views')) timestamp = parse_iso8601(video_meta.get(date), ' ') categories = [video_meta.get('category_name')] - thumbs = [ - {'url': thumb_base + video_meta.get('media_thumbnail'), - 'width': 320, - 'height': 180}, - {'url': thumb_base + video_meta.get('media_thumbnail_large'), - 'width': 768, - 'height': 432}, - ] + thumbs = [{ + 'url': thumb_base + video_meta.get('media_thumbnail'), + 'width': 320, + 'height': 180 + }, { + 'url': thumb_base + video_meta.get('media_thumbnail_large'), + 'width': 768, + 'height': 432 + }] return { 'id': video_id, @@ -90,7 +93,7 @@ class HitboxIE(InfoExtractor): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/video/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/video/%s' % video_id, video_id, 'Downloading video JSON') formats = [] @@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/video', - video_id) + 'https://www.smashcast.tv/api/media/video', video_id) metadata['formats'] = formats return metadata @@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor): class HitboxLiveIE(HitboxIE): IE_NAME = 'hitbox:live' - _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.hitbox.tv/dimak', 'info_dict': { 'id': 'dimak', @@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE): # live 'skip_download': True, }, - } + }, { + 'url': 'https://www.smashcast.tv/dimak', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url) def _real_extract(self, url): video_id = self._match_id(url) player_config = self._download_json( - 'https://www.hitbox.tv/api/player/config/live/%s' % video_id, + 'https://www.smashcast.tv/api/player/config/live/%s' % video_id, video_id) formats = [] @@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE): self._sort_formats(formats) metadata = self._extract_metadata( - 'https://www.hitbox.tv/api/media/live', - video_id) + 'https://www.smashcast.tv/api/media/live', video_id) metadata['formats'] = formats metadata['is_live'] = True metadata['title'] = self._live_title(metadata.get('title')) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f95c00c73..3ff672a89 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/videoplayer/vi1562949145', 'only_matching': True, + }, { + 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9fb71e8ef..fe425e786 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE): def _extract_http_audio(self, webpage, video_id): fields = self._hidden_inputs(webpage) - http_audio_url = fields['filename'] - if http_audio_url is None: + http_audio_url = fields.get('filename') + if not http_audio_url: return [] cookies_header = {'Cookie': self._extract_cookies(webpage)} diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 3190b187c..1f91ba017 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -8,15 +10,15 @@ from ..utils import ( urlencode_postdata, xpath_element, xpath_text, - urljoin, update_url_query, + js_to_json, ) class Laola1TvEmbedIE(InfoExtractor): IE_NAME = 'laola1tv:embed' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TEST = { + _TESTS = [{ # flashvars.premium = "false"; 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', 'info_dict': { @@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor): 'uploader': 'ITTF - International Table Tennis Federation', 'upload_date': '20161211', }, - } + }] + + def _extract_token_url(self, stream_access_url, video_id, data): + return self._download_json( + stream_access_url, video_id, headers={ + 'Content-Type': 'application/json', + }, data=json.dumps(data).encode())['data']['stream-access'][0] + + def _extract_formats(self, token_url, video_id): + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) @@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor): else: data_abo = urlencode_postdata( dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - token_url = self._download_json( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', - video_id, query={ + stream_access_url = update_url_query( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { 'videoId': _v('id'), 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), 'label': _v('label'), 'area': _v('area'), - }, data=data_abo)['data']['stream-access'][0] - - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib - - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) + }) + token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - self._sort_formats(formats) + formats = self._extract_formats(token_url, video_id) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] @@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(InfoExtractor): +class Laola1TvIE(Laola1TvEmbedIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor): if 'Dieser Livestream ist bereits beendet.' in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = urljoin(url, self._search_regex( - r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url')) + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, js_to_json) + + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) return { - '_type': 'url', + 'id': video_id, 'display_id': display_id, - 'url': iframe_url, - 'ie_key': 'Laola1TvEmbed', + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 9eda956d2..0a07c1320 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,7 +23,6 @@ from ..utils import ( str_or_none, url_basename, urshift, - update_url_query, ) @@ -51,7 +50,7 @@ class LeIE(InfoExtractor): 'id': '1415246', 'ext': 'mp4', 'title': '美人天下01', - 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', + 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', }, 'params': { 'hls_prefer_native': True, @@ -69,7 +68,6 @@ class LeIE(InfoExtractor): 'params': { 'hls_prefer_native': True, }, - 'skip': 'Only available in China', }, { 'url': 'http://sports.le.com/video/25737697.html', 'only_matching': True, @@ -81,7 +79,7 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf + # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: @@ -90,15 +88,8 @@ class LeIE(InfoExtractor): return param1 def calc_time_key(self, param1): - _loc2_ = 773625421 - _loc3_ = self.ror(param1, _loc2_ % 13) - _loc3_ = _loc3_ ^ _loc2_ - _loc3_ = self.ror(_loc3_, _loc2_ % 17) - return _loc3_ - - # reversed from http://jstatic.letvcdn.com/sdk/player.js - def get_mms_key(self, time): - return self.ror(time, 8) ^ 185025305 + _loc2_ = 185025305 + return self.ror(param1, _loc2_ % 17) ^ _loc2_ # see M3U8Encryption class in KLetvPlayer.swf @staticmethod @@ -122,7 +113,7 @@ class LeIE(InfoExtractor): def _check_errors(self, play_json): # Check for errors - playstatus = play_json['playstatus'] + playstatus = play_json['msgs']['playstatus'] if playstatus['status'] == 0: flag = playstatus['flag'] if flag == 1: @@ -134,58 +125,31 @@ class LeIE(InfoExtractor): media_id = self._match_id(url) page = self._download_webpage(url, media_id) - play_json_h5 = self._download_json( - 'http://api.le.com/mms/out/video/playJsonH5', - media_id, 'Downloading html5 playJson data', query={ - 'id': media_id, - 'platid': 3, - 'splatid': 304, - 'format': 1, - 'tkey': self.get_mms_key(int(time.time())), - 'domain': 'www.le.com', - 'tss': 'no', - }, - headers=self.geo_verification_headers()) - self._check_errors(play_json_h5) - play_json_flash = self._download_json( - 'http://api.le.com/mms/out/video/playJson', + 'http://player-pc.le.com/mms/out/video/playJson', media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, 'splatid': 101, 'format': 1, + 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.le.com', + 'region': 'cn', }, headers=self.geo_verification_headers()) self._check_errors(play_json_flash) - def get_h5_urls(media_url, format_id): - location = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id, query={ - 'format': 1, - 'expect': 3, - 'tss': 'no', - })['location'] - - return { - 'http': update_url_query(location, {'tss': 'no'}), - 'hls': update_url_query(location, {'tss': 'ios'}), - } - def get_flash_urls(media_url, format_id): - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'rateid': format_id, - }) - nodes_data = self._download_json( media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + 'Download JSON metadata for format %s' % format_id, + query={ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'tss': 'ios', + }) req = self._request_webpage( nodes_data['nodelist'][0]['location'], media_id, @@ -199,29 +163,28 @@ class LeIE(InfoExtractor): extracted_formats = [] formats = [] - for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): - playurl = play_json['playurl'] - play_domain = playurl['domain'][0] - - for format_id, format_data in playurl.get('dispatch', []).items(): - if format_id in extracted_formats: - continue - extracted_formats.append(format_id) - - media_url = play_domain + format_data[0] - for protocol, format_url in get_urls(media_url, format_id).items(): - f = { - 'url': format_url, - 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - 'quality': int_or_none(format_id), - } - - if format_id[-1:] == 'p': - f['height'] = int_or_none(format_id[:-1]) - - formats.append(f) + playurl = play_json_flash['msgs']['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_flash_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index d3bca6435..b312e77f1 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor): formats = self._extract_akamai_formats( '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == len(self._BITRATES): self._sort_formats(m3u8_formats) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c7de65353..c54519636 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '50f79e05ba149149c1b4ea961223d5b3', + 'md5': '0813c2430bea7a46bf13acf3406992f4', 'info_dict': { 'id': '757_1364311680', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident', @@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor): } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'b13a29626183c9d33944e6a04f41aafc', + 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', 'info_dict': { 'id': 'f93_1390833151', 'ext': 'mp4', @@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$' } }, { + # Prochan embed 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', 'md5': '42c6d97d54f1db107958760788c5f48f', 'info_dict': { @@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'CapObveus', 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, - } + }, + 'skip': 'Video is dead', }, { # Covers https://github.com/rg3/youtube-dl/pull/5983 + # Multiple resolutions 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', + 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', 'info_dict': { 'id': '801_1409392012', 'ext': 'mp4', @@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor): webpage, 'age limit', default=None)) video_thumbnail = self._og_search_thumbnail(webpage) - sources_raw = self._search_regex( - r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) - if sources_raw is None: - alt_source = self._search_regex( - r'(file: ".*?"),', webpage, 'video URL', default=None) - if alt_source: - sources_raw = '[{ %s}]' % alt_source - else: - # Maybe an embed? - embed_url = self._search_regex( - r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } + entries = self._parse_html5_media_entries(url, webpage, video_id) + if not entries: + # Maybe an embed? + embed_url = self._search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', + webpage, 'embed URL') + return { + '_type': 'url_transparent', + 'url': embed_url, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + } - sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) - sources = json.loads(sources_json) + info_dict = entries[0] - formats = [{ - 'format_id': '%s' % i, - 'format_note': s.get('label'), - 'url': s['file'], - } for i, s in enumerate(sources)] + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None) - for i, s in enumerate(sources): - # Removing '.h264_*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/rg3/youtube-dl/pull/4768) - orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) - if s['file'] != orig_url: - formats.append({ - 'format_id': 'original-%s' % i, - 'format_note': s.get('label'), - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) + self._sort_formats(info_dict['formats']) - return { + info_dict.update({ 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, - 'formats': formats, 'age_limit': age_limit, 'thumbnail': video_thumbnail, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py new file mode 100644 index 000000000..9760eafd5 --- /dev/null +++ b/youtube_dl/extractor/mediaset.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + parse_duration, + try_get, + unified_strdate, +) + + +class MediasetIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + mediaset:| + https?:// + (?:www\.)?video\.mediaset\.it/ + (?: + (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + ) + )(?P<id>[0-9]+) + ''' + _TESTS = [{ + # full episode + 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', + 'info_dict': { + 'id': '661824', + 'ext': 'mp4', + 'title': 'Quarta puntata', + 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1414, + 'creator': 'mediaset', + 'upload_date': '20161107', + 'series': 'Hello Goodbye', + 'categories': ['reality'], + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # clip + 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'only_matching': True, + }, { + # iframe simple + 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'only_matching': True, + }, { + # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) + 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'only_matching': True, + }, { + 'url': 'mediaset:661824', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_list = self._download_json( + 'http://cdnsel01.mediaset.net/GetCdn.aspx', + video_id, 'Downloading video CDN JSON', query={ + 'streamid': video_id, + 'format': 'json', + })['videoList'] + + formats = [] + for format_url in video_list: + if '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': determine_ext(format_url), + }) + self._sort_formats(formats) + + mediainfo = self._download_json( + 'http://plr.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading video info JSON', query={ + 'id': video_id, + })['video'] + + title = mediainfo['title'] + + creator = try_get( + mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + category = try_get( + mediainfo, lambda x: x['brand-info']['category'], compat_str) + categories = [category] if category else None + + return { + 'id': video_id, + 'title': title, + 'description': mediainfo.get('short-description'), + 'thumbnail': mediainfo.get('thumbnail'), + 'duration': parse_duration(mediainfo.get('duration')), + 'creator': creator, + 'upload_date': unified_strdate(mediainfo.get('production-date')), + 'webpage_url': mediainfo.get('url'), + 'series': mediainfo.get('brand-value'), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 28b743cca..964dc542c 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor): video_id, 'Downloading gigya script') # Get a appKey/uuid for getting the session key - appKey_var = self._search_regex( - r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)', - gigya_sc, 'appKey variable') appKey = self._search_regex( - r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey') + r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)', + gigya_sc, 'appKey') session_json = self._download_json( 'https://appgrid-api.cloud.accedo.tv/session', diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index f281238c9..e164d5940 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -12,64 +12,62 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + myspace\.com/[^/]+/ + (?P<mediatype> + video/[^/]+/(?P<video_id>\d+)| + music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) + ) + ''' - _TESTS = [ - { - 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', - 'md5': '9c1483c106f4a695c47d2911feed50a7', - 'info_dict': { - 'id': '109594919', - 'ext': 'mp4', - 'title': 'Little Big Town', - 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', - 'uploader': 'Five Minutes to the Stage', - 'uploader_id': 'fiveminutestothestage', - 'timestamp': 1414108751, - 'upload_date': '20141023', - }, + _TESTS = [{ + 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', + 'info_dict': { + 'id': '109594919', + 'ext': 'mp4', + 'title': 'Little Big Town', + 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', + 'uploader': 'Five Minutes to the Stage', + 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, + }, { # songs - { - 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', - 'md5': '1d7ee4604a3da226dd69a123f748b262', - 'info_dict': { - 'id': '93388656', - 'ext': 'm4a', - 'title': 'Of weakened soul...', - 'uploader': 'Killsorrow', - 'uploader_id': 'killsorrow', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', - 'info_dict': { - 'id': 'xqds0B_meys', - 'ext': 'webm', - 'title': 'Three Days Grace - Animal I Have Become', - 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', - 'uploader': 'ThreeDaysGraceVEVO', - 'uploader_id': 'ThreeDaysGraceVEVO', - 'upload_date': '20091002', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', - 'info_dict': { - 'id': 'ypWvQgnJrSU', - 'ext': 'mp4', - 'title': 'Starset - First Light', - 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Yumi K', - 'uploader_id': 'SorenPromotions', - 'upload_date': '20140725', - } + 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', + 'info_dict': { + 'id': '93388656', + 'ext': 'm4a', + 'title': 'Of weakened soul...', + 'uploader': 'Killsorrow', + 'uploader_id': 'killsorrow', }, - ] + }, { + 'add_ie': ['Youtube'], + 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', + 'info_dict': { + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', + }, + }, { + 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', + 'only_matching': True, + }, { + 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('video_id') or mobj.group('song_id') is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2a44d05d..62db70b43 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -5,10 +5,8 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_urlparse from ..utils import ( find_xpath_attr, - lowercase_escape, smuggle_url, unescapeHTML, update_url_query, @@ -17,7 +15,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ { @@ -36,16 +34,6 @@ class NBCIE(AdobePassIE): 'skip_download': True, }, }, - { - 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', - 'info_dict': { - 'id': '176', - 'ext': 'flv', - 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', - 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', - }, - 'skip': '404 Not Found', - }, { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { @@ -63,11 +51,6 @@ class NBCIE(AdobePassIE): }, 'skip': 'Only works from US', }, - { - # This video has expired but with an escaped embedURL - 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', - 'only_matching': True, - }, { # HLS streams requires the 'hdnea3' cookie 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', @@ -88,59 +71,38 @@ class NBCIE(AdobePassIE): ] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - info = { + permalink, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://api.nbc.com/v3/videos', video_id, query={ + 'filter[permalink]': permalink, + })['data'][0]['attributes'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + video_id = video_data['guid'] + title = video_data['title'] + if video_data.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'nbcentertainment', title, video_id, + video_data.get('vChipRating')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'nbcentertainment', resource) + theplatform_url = smuggle_url(update_url_query( + 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + query), {'force_smil_url': True}) + return { '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, + 'title': title, + 'url': theplatform_url, + 'description': video_data.get('description'), + 'keywords': video_data.get('keywords'), + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode_number': int_or_none(video_data.get('episodeNumber')), + 'series': video_data.get('showName'), + 'ie_key': 'ThePlatform', } - video_data = None - preload = self._search_regex( - r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None) - if preload: - preload_data = self._parse_json(preload, video_id) - path = compat_urllib_parse_urlparse(url).path.rstrip('/') - entity_id = preload_data.get('xref', {}).get(path) - video_data = preload_data.get('entities', {}).get(entity_id) - if video_data: - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': - resource = self._get_mvpd_resource( - 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'nbcentertainment', resource) - theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, - query), {'force_smil_url': True}) - info.update({ - 'id': video_id, - 'title': title, - 'url': theplatform_url, - 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), - }) - else: - theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( - [ - r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', - r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', - r'"embedURL"\s*:\s*"([^"]+)"' - ], - webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) - if theplatform_url.startswith('//'): - theplatform_url = 'http:' + theplatform_url - info['url'] = smuggle_url(theplatform_url, {'source_url': url}) - return info class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index f5e3f6815..9b5ad5a9f 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + extract_attributes, get_element_by_class, urlencode_postdata, ) @@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage): - player_url = compat_urlparse.urljoin(url, player_url) - + for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): + player = extract_attributes(mobj.group(0)) + player_path = player.get('href') + if not player_path: + continue + kind = self._search_regex( + r'(low|high)$', player.get('class') or '', 'kind', + default='low') + player_url = compat_urlparse.urljoin(url, player_path) player_page = self._download_webpage( player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native', - preference=2 if 'hq' in kind else 1) - formats.extend(entries[0]['formats']) + m3u8_entry_protocol='m3u8_native') + kind_formats = entries[0]['formats'] + for f in kind_formats: + f['quality'] = 2 if kind == 'high' else 1 + formats.extend(kind_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py new file mode 100644 index 000000000..63e58aae2 --- /dev/null +++ b/youtube_dl/extractor/nonktube.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class NonkTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized', + 'info_dict': { + 'id': '118636', + 'ext': 'mp4', + 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized', + 'age_limit': 18, + 'duration': 1150.98, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.nonktube.com/embed/118636', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_nuevo( + 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' + % video_id, video_id) + + info['age_limit'] = 18 + return info diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py new file mode 100644 index 000000000..f7fa098a5 --- /dev/null +++ b/youtube_dl/extractor/noovo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + smuggle_url, + try_get, +) + + +class NoovoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ + # clip + 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial', + 'info_dict': { + 'id': '5386045029001', + 'ext': 'mp4', + 'title': 'Chrysler Imperial', + 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056', + 'timestamp': 1491399228, + 'upload_date': '20170405', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': 'RPM+', + }, + 'params': { + 'skip_download': True, + }, + }, { + # episode + 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8', + 'info_dict': { + 'id': '5395865725001', + 'title': 'Épisode 13 : Les retrouvailles', + 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'ext': 'mp4', + 'timestamp': 1492019320, + 'upload_date': '20170412', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': "L'amour est dans le pré", + 'season_number': 5, + 'episode': 'Épisode 13', + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, + video_id)['data'] + + content = try_get(data, lambda x: x['contents'][0]) + + brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + + series = try_get( + data, ( + lambda x: x['show']['title'], + lambda x: x['season']['show']['title']), + compat_str) + + episode = None + og = data.get('og') + if isinstance(og, dict) and og.get('type') == 'video.episode': + episode = og.get('title') + + video = content or data + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'title': video.get('title'), + 'creator': video.get('source'), + 'view_count': int_or_none(video.get('viewsCount')), + 'series': series, + 'season_number': int_or_none(try_get( + data, lambda x: x['season']['seasonNumber'])), + 'episode': episode, + 'episode_number': int_or_none(data.get('episodeNumber')), + } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7fe79cb53..3b4f51f61 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor): vcodec = 'none' if data.get('mediaType') == 'Audio' else None - # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged - for entry in entries: entry.update(common_info) for f in entry['formats']: f['vcodec'] = vcodec + points = data.get('shortIndexPoints') + if isinstance(points, list): + chapters = [] + for next_num, point in enumerate(points, start=1): + if not isinstance(point, dict): + continue + start_time = parse_duration(point.get('startPoint')) + if start_time is None: + continue + end_time = parse_duration( + data.get('duration') + if next_num == len(points) + else points[next_num].get('startPoint')) + if end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': point.get('title'), + }) + if chapters and len(entries) == 1: + entries[0]['chapters'] = chapters + return self.playlist_result(entries, video_id, title, description) diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index 87fb94d1f..be1e09d37 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -10,9 +10,10 @@ from ..utils import ( class NuevoBaseIE(InfoExtractor): - def _extract_nuevo(self, config_url, video_id): + def _extract_nuevo(self, config_url, video_id, headers={}): config = self._download_xml( - config_url, video_id, transform_source=lambda s: s.strip()) + config_url, video_id, transform_source=lambda s: s.strip(), + headers=headers) title = xpath_text(config, './title', 'title', fatal=True).strip() video_id = xpath_text(config, './mediaid', default=video_id) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 1e2c54e68..cc296eabd 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals import re -import calendar -import datetime from .common import InfoExtractor from ..compat import compat_str @@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor): } -class ORFOE1IE(InfoExtractor): - IE_NAME = 'orf:oe1' - IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' - - # Audios on ORF radio are only available for 7 days, so we can't add tests. - _TESTS = [{ - 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', - 'only_matching': True, - }, { - 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - data = self._download_json( - 'http://oe1.orf.at/programm/%s/konsole' % show_id, - show_id - ) - - timestamp = datetime.datetime.strptime('%s %s' % ( - data['item']['day_label'], - data['item']['time'] - ), '%d.%m.%Y %H:%M') - unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - - return { - 'id': show_id, - 'title': data['item']['title'], - 'url': data['item']['url_stream'], - 'ext': 'mp3', - 'description': data['item'].get('info'), - 'timestamp': unix_timestamp - } - - -class ORFFM4IE(InfoExtractor): - IE_NAME = 'orf:fm4' - IE_DESC = 'radio FM4' - _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' - - _TEST = { - 'url': 'http://fm4.orf.at/player/20160110/IS/', - 'md5': '01e736e8f1cef7e13246e880a59ad298', - 'info_dict': { - 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', - 'ext': 'mp3', - 'title': 'Im Sumpf', - 'description': 'md5:384c543f866c4e422a55f66a62d669cd', - 'duration': 7173, - 'timestamp': 1452456073, - 'upload_date': '20160110', - }, - 'skip': 'Live streams on FM4 got deleted soon', - } - +class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') + if station == 'fm4': + show_id = '4%s' % show_id + data = self._download_json( - 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date), show_id ) def extract_entry_dict(info, title, subtitle): return { 'id': info['loopStreamId'].replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']), 'title': title, 'description': subtitle, 'duration': (info['end'] - info['start']) / 1000, @@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor): } +class ORFFM4IE(ORFRadioIE): + IE_NAME = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20170107/CC', + 'md5': '2b0be47375432a7ef104453432a19212', + 'info_dict': { + 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', + 'ext': 'mp3', + 'title': 'Solid Steel Radioshow', + 'description': 'Die Mixshow von Coldcut und Ninja Tune.', + 'duration': 3599, + 'timestamp': 1483819257, + 'upload_date': '20170107', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + +class ORFOE1IE(ORFRadioIE): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://oe1.orf.at/player/20170108/456544', + 'md5': '34d8a6e67ea888293741c86a099b745b', + 'info_dict': { + 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', + 'ext': 'mp3', + 'title': 'Morgenjournal', + 'duration': 609, + 'timestamp': 1483858796, + 'upload_date': '20170108', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 881f3bcc7..bb668c999 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_HTTPError, +) from ..utils import ( clean_html, ExtractorError, @@ -11,6 +14,7 @@ from ..utils import ( strip_or_none, unified_timestamp, urljoin, + urlencode_postdata, ) @@ -34,6 +38,32 @@ class PacktPubIE(PacktPubBaseIE): 'upload_date': '20170331', }, } + _NETRC_MACHINE = 'packtpub' + _TOKEN = None + + def _real_initialize(self): + (username, password) = self._get_login_info() + if username is None: + return + webpage = self._download_webpage(self._PACKT_BASE, None) + login_form = self._form_hidden_inputs( + 'packt-user-login-form', webpage) + login_form.update({ + 'email': username, + 'password': password, + }) + self._download_webpage( + self._PACKT_BASE, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form)) + try: + self._TOKEN = self._download_json( + '%s/users/tokens/sessions' % self._MAPT_REST, None, + 'Downloading Authorization Token')['data']['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 404): + message = self._parse_json(e.cause.read().decode(), None)['message'] + raise ExtractorError(message, expected=True) + raise def _handle_error(self, response): if response.get('status') != 'success': @@ -51,14 +81,17 @@ class PacktPubIE(PacktPubBaseIE): course_id, chapter_id, video_id = mobj.group( 'course_id', 'chapter_id', 'id') + headers = {} + if self._TOKEN: + headers['Authorization'] = self._TOKEN video = self._download_json( '%s/users/me/products/%s/chapters/%s/sections/%s' % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, - 'Downloading JSON video')['data'] + 'Downloading JSON video', headers=headers)['data'] content = video.get('content') if not content: - raise ExtractorError('This video is locked', expected=True) + self.raise_login_required('This video is locked') video_url = content['file'] diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3e51b4dd7..16cc667d0 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,7 +8,9 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -263,6 +265,13 @@ class PBSIE(InfoExtractor): }, 'playlist_count': 2, }, + { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { @@ -381,10 +390,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, upload_date, description @@ -464,6 +473,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -479,6 +489,20 @@ class PBSIE(InfoExtractor): extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -515,7 +539,7 @@ class PBSIE(InfoExtractor): http_url = format_url self._remove_duplicate_formats(formats) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: @@ -588,4 +612,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b25f1f193..1dcc8df00 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index ed38c77eb..e2202d603 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -62,8 +62,7 @@ class R7IE(InfoExtractor): # m3u8 format always matches the http format, let's copy metadata from # one to another m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - formats)) + lambda f: f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == 1: f_copy = m3u8_formats[0].copy() f_copy.update(f) diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index 2340dae53..e921ca3e6 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor): _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', 'info_dict': { - 'id': '5111223049001', + 'id': '5419055995001', 'ext': 'mp4', - 'title': ': LES HEROS DU 88e ETAGE', - 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'title': 'UN DELICIEUX PROJET', + 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', 'uploader_id': '1969646226001', - 'upload_date': '20160904', - 'timestamp': 1472951103, + 'upload_date': '20170502', + 'timestamp': 1493745308, }, 'params': { - # rtmp download 'skip_download': True, }, - 'skip': 'Only works from France', + 'skip': 'only available for a week', } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' @@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py index 9e533103c..58e0b4c80 100644 --- a/youtube_dl/extractor/streamcz.py +++ b/youtube_dl/extractor/streamcz.py @@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti', - 'md5': '6d3ca61a8d0633c9c542b92fcb936b0c', + 'md5': '934bb6a6d220d99c010783c9719960d5', 'info_dict': { 'id': '765767', 'ext': 'mp4', @@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor): }, }, { 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka', - 'md5': 'e54a254fb8b871968fd8403255f28589', + 'md5': '849a88c1e1ca47d41403c2ba5e59e261', 'info_dict': { 'id': '10002447', 'ext': 'mp4', @@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor): else: title = data['name'] + subtitles = {} + srt_url = data.get('subtitles_srt') + if srt_url: + subtitles['cs'] = [{ + 'ext': 'srt', + 'url': srt_url, + }] + return { 'id': video_id, 'title': title, @@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor): 'description': data.get('web_site_text'), 'duration': int_or_none(data.get('duration')), 'view_count': int_or_none(data.get('views')), + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1b1afab32..3f3c681ae 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -210,7 +210,7 @@ class TEDIE(InfoExtractor): resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a424b1c6..de236bbba 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE): 'url': src, }) + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + _add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + return { 'title': info['title'], 'subtitles': subtitles, 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(duration, 1000), 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'uploader': info.get('billingCode'), + 'chapters': chapters, } def _extract_theplatform_metadata(self, path, video_id): diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index b8504f0eb..cd642355c 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - qualities, -) class TheSceneIE(InfoExtractor): @@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor): 'season': 'Ready To Wear Spring 2013', 'tags': list, 'categories': list, + 'upload_date': '20120913', + 'timestamp': 1347512400, + 'uploader': 'vogue', }, } @@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor): self._html_search_regex( r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - player = self._download_webpage(player_url, display_id) - info = self._parse_json( - self._search_regex( - r'(?m)video\s*:\s*({.+?}),$', player, 'info json'), - display_id) - - video_id = info['id'] - title = info['title'] - - qualities_order = qualities(('low', 'high')) - formats = [{ - 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), - 'url': f['src'], - 'quality': qualities_order(f['quality']), - } for f in info['sources']] - self._sort_formats(formats) - return { - 'id': video_id, + '_type': 'url_transparent', 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('poster_frame'), - 'duration': int_or_none(info.get('duration')), - 'series': info.get('series_title'), - 'season': info.get('season_title'), - 'tags': info.get('tags'), - 'categories': info.get('categories'), + 'url': player_url, + 'ie_key': 'CondeNast', } diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index c54b876d3..348d6ecdf 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -17,7 +17,7 @@ from ..utils import ( class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)' + _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor): }, { 'url': 'http://video.toggle.sg/en/movies/seven-days/321936', 'only_matching': True, + }, { + 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456', + 'only_matching': True, + }, { + 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585', + 'only_matching': True, }] _FORMAT_PREFERENCES = { diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 938e05076..f705a06c9 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -6,42 +6,48 @@ import re class ToypicsIE(InfoExtractor): - IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*' + IE_DESC = 'Toypics video' + _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)' _TEST = { 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'md5': '16e806ad6d6f58079d210fe30985e08b', 'info_dict': { 'id': '514', 'ext': 'mp4', - 'title': 'Chance-Bulge\'d, 2', + 'title': "Chance-Bulge'd, 2", 'age_limit': 18, 'uploader': 'kidsune', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - page = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') - title = self._html_search_regex( - r'<title>Toypics - ([^<]+)', page, 'title') - username = self._html_search_regex( - r'toypics.net/([^/"]+)" class="user-name">', page, 'username') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + title = self._html_search_regex([ + r']+class=["\']view-video-title[^>]+>([^<]+)([^<]+) - Toypics', + ], webpage, 'title') + + uploader = self._html_search_regex( + r'More videos from ([^<]+)', webpage, 'uploader', + fatal=False) + return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, - 'uploader': username, + 'uploader': uploader, 'age_limit': 18, } class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'https?://videos\.toypics\.net/(?P[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P[^/?#&]+)' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { @@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - username = mobj.group('username') + username = self._match_id(url) profile_page = self._download_webpage( url, username, note='Retrieving profile page') @@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor): note='Downloading page %d/%d' % (n, page_count)) urls.extend( re.findall( - r'

\s+', + r']+class=["\']preview[^>]+>\s*]+href="(https?://videos\.toypics\.net/view/[^"]+)"', lpage)) return { diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 1c0be9fc6..efeb677ee 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -13,6 +13,7 @@ from ..utils import ( xpath_attr, update_url_query, ExtractorError, + strip_or_none, ) @@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE): 'height': int_or_none(image.get('height')), } for image in video_data.findall('images/image')] + is_live = xpath_text(video_data, 'isLive') == 'true' + return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'description': xpath_text(video_data, 'description'), + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), 'timestamp': self._extract_timestamp(video_data), 'upload_date': xpath_attr(video_data, 'metas', 'version'), 'series': xpath_text(video_data, 'showTitle'), 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 06ea2b40a..c5b3288ad 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) formats.extend(m3u8_formats) for i, m3u8_format in enumerate(m3u8_formats, 2): http_url = '%s-%d.mp4' % (video_url_base, i) diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index b6537141a..ebde6053f 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( extract_attributes, + try_get, urlencode_postdata, ExtractorError, ) @@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor): webpage, 'channel element')) title = current_channel['data-name'] - resource_id = self._search_regex( - r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id') - platform = self._search_regex( - r'platform\s*=\s*"([^"]+)"', webpage, 'platform') + resource_id = current_channel['data-id'] + token = self._search_regex( - r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null') - validate = self._search_regex( - r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null') + r'data-token=(["\'])(?P(?!\1).+)\1', webpage, + 'token', group='token') + + context = self._download_json( + 'https://tvplayer.com/watch/context', display_id, + 'Downloading JSON context', query={ + 'resource': resource_id, + 'nonce': token, + }) + + validate = context['validate'] + platform = try_get( + context, lambda x: x['platform']['key'], compat_str) or 'firefox' try: response = self._download_json( 'http://api.tvplayer.com/api/v2/stream/live', - resource_id, headers={ + display_id, 'Downloading JSON stream', headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }, data=urlencode_postdata({ + 'id': resource_id, 'service': 1, 'platform': platform, - 'id': resource_id, - 'token': token, 'validate': validate, }))['tvplayer']['response'] except ExtractorError as e: @@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, response['error']), expected=True) raise - formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4') + formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py new file mode 100644 index 000000000..30297b4dd --- /dev/null +++ b/youtube_dl/extractor/upskill.py @@ -0,0 +1,176 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + urlencode_postdata, + urljoin, +) + + +class UpskillBaseIE(InfoExtractor): + _LOGIN_URL = 'http://upskillcourses.com/sign_in' + _NETRC_MACHINE = 'upskill' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login page') + + login_url = compat_str(urlh.geturl()) + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r']+action=(["\'])(?P(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_url, + }) + + # Successful login + if any(re.search(p, response) for p in ( + r'class=["\']user-signout', + r']+\bhref=["\']/sign_out', + r'>\s*Log out\s*<')): + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class UpskillIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P\d+)' + + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'info_dict': { + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: + if any(re.search(p, webpage) for p in ( + r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked')): + self.raise_login_required('Lecture contents locked') + + title = self._og_search_title(webpage, default=None) + + return { + '_type': 'url_transparent', + 'url': wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + } + + +class UpskillCourseIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': '119763', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if UpskillIE.suitable(url) else super( + UpskillCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id', + default=course_id) + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P

  • ]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?
  • )', + webpage): + li = mobj.group('li') + if 'fa-youtube-play' not in li: + continue + lecture_url = self._search_regex( + r']+href=(["\'])(?P(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r']+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entries.append( + self.url_result( + urljoin('http://upskillcourses.com/', lecture_url), + ie=UpskillIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + + course_title = self._html_search_regex( + (r'(?s)]+class=["\']course-image[^>]+>\s*(.+?)]+class=["\']course-title[^>]+>(.+?))', webpage, 'watch hub')) video_id = watch_hub_data['vms-id'] @@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, watch_hub_data.get('video-rating')) - query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) # signature generation algorithm is reverse engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE): try: host = 'www.viceland' if is_locked else self._PREPLAY_HOST - preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) + preplay = self._download_json( + 'https://%s.com/%s/preplay/%s' % (host, locale, video_id), + video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: error = json.loads(e.cause.read().decode()) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error['details']), expected=True) raise video_data = preplay['video'] @@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE): class ViceIE(ViceBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P[^/?#&]+)' + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P[^/]+)/)?videos?/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', - 'md5': 'e9d77741f9e42ba583e683cd170660f7', + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2', 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', 'ext': 'flv', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, + 'title': 'Monkey Labs of Holland', + 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', }, 'add_ie': ['Ooyala'], - }, { - 'url': 'http://www.vice.com/video/how-to-hack-a-car', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', - 'info_dict': { - 'id': '3jstaBeXgAs', - 'ext': 'mp4', - 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', - 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader_id': 'MotherboardTV', - 'uploader': 'Motherboard', - 'upload_date': '20140529', - }, - 'add_ie': ['Youtube'], }, { 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'md5': '', 'info_dict': { 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', 'uploader': 'Waypoint', 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'uploader_id': '57f7d621e05ca860fa9ccaf9', - 'timestamp': 1477941983938, + 'timestamp': 1477941983, + 'upload_date': '20161031', }, 'params': { # m3u8 download @@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE): }, 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - }, { - 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', - 'only_matching': True, + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': '

    Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.

    ', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, }] _PREPLAY_HOST = 'video.vice' def _real_extract(self, url): - video_id = self._match_id(url) + locale, video_id = re.match(self._VALID_URL, url).groups() webpage, urlh = self._download_webpage_handle(url, video_id) embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, @@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE): r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) if youtube_id: return self.url_result(youtube_id, 'Youtube') - return self._extract_preplay_video(urlh.geturl(), webpage) + return self._extract_preplay_video(urlh.geturl(), locale, webpage) class ViceShowIE(InfoExtractor): + IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P[^/?#&]+)' _TEST = { @@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor): r'(.+?)', webpage, 'title', default=None) if title: title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta('description', webpage, 'description') + description = self._html_search_meta( + 'description', webpage, 'description') return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): + IE_NAME = 'vice:article' + _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '58dc0a3dee202d2a0ccfcbd8', + 'ext': 'mp4', + 'title': 'Mormon War on Porn ', + 'description': 'md5:ad396a2481e7f8afb5ed486878421090', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c693', + 'timestamp': 1489160690, + 'upload_date': '20170310', + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'window\.__PREFETCH_DATA\s*=\s*({.*});', + webpage, 'prefetch data'), display_id) + body = prefetch_data['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = self._html_search_regex( + r']+src="(.*youtube\.com/.*)"', + body, 'YouTube URL', default=None) + if youtube_url: + return _url_res(youtube_url, 'Youtube') + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + prefetch_data['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 87f9216b5..bd60235c8 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .vice import ViceBaseIE class VicelandIE(ViceBaseIE): - _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P[^/]+)/video/[^/]+/(?P[a-f0-9]+)' _TEST = { 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', 'info_dict': { @@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE): 'skip_download': True, }, 'add_ie': ['UplynkPreplay'], + 'skip': '404', } _PREPLAY_HOST = 'www.viceland' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + locale = mobj.group('locale') webpage = self._download_webpage(url, video_id) - return self._extract_preplay_video(url, webpage) + return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25a5..e5f964d39 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import random import re from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import ( float_or_none, parse_age_limit, qualities, + random_birthday, try_get, unified_timestamp, urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + query = random_birthday('birth_year', 'birth_month', 'birth_day') video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, - video_id, query={ - 'birth_month': random.randint(1, 12), - 'birth_day': random.randint(1, 31), - 'birth_year': random.randint(1950, 1995), - }) + video_id, query=query) title = video['title'] diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5ef7635b6..3e67eb8c2 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,24 +5,30 @@ import re import itertools from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) class VierIE(InfoExtractor): IE_NAME = 'vier' IE_DESC = 'vier.be and vijf.be' _VALID_URL = r'https?://(?:www\.)?(?Pvier|vijf)\.be/(?:[^/]+/videos/(?P[^/]+)(?:/(?P\d+))?|video/v3/embed/(?P\d+))' + _NETRC_MACHINE = 'vier' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], }, }, { 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', @@ -30,32 +36,103 @@ class VierIE(InfoExtractor): 'id': '2561614', 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 'ext': 'mp4', - 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', - 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Log in to extract metadata'], }, { - 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, }] + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)
    \s*
    \s*(.+?)<', + login_page, 'login error', default=None) + if login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id site = mobj.group('site') + if not self._logged_in: + self._login(site) + webpage = self._download_webpage(url, display_id) + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), + display_id) + video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id') + webpage, 'video id', default=video_id or display_id) application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], webpage, 'application', default=site + '_vod') @@ -64,12 +141,25 @@ class VierIE(InfoExtractor): webpage, 'filename') playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?

    (?P.+?)

    ', + webpage, 'description', default=None, group='value') thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r']+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) return { 'id': video_id, @@ -77,6 +167,10 @@ class VierIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, 'formats': formats, } diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fcf0cb100..d5d5b4c69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 487047fd7..9959627c0 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE): audio_locale = streams_json.get('audio_locale') formats = [] - for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items(): - stream_url = stream.get('url') - if not stream_url: - continue - stream_id = stream_id or audio_locale - m3u8_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=stream_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - if audio_locale: - for f in m3u8_formats: - f['language'] = audio_locale - formats.extend(m3u8_formats) + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream.get('hardsub_locale') or audio_locale + format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) + if stream_type == 'adaptive_hls': + adaptive_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + else: + adaptive_formats = self._extract_mpd_formats( + stream_url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_locale: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + formats.extend(adaptive_formats) self._sort_formats(formats) + subtitles = {} + for subtitle in streams_json.get('subtitles', {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + thumbnails = [] for thumbnail in video_data.get('images', {}).get('thumbnails', []): thumbnail_url = thumbnail.get('source') @@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE): 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'description': video_data.get('description'), 'duration': float_or_none(video_data.get('duration_ms'), 1000), diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 839cad986..625d0a1cc 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -13,6 +13,7 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _TEST = { 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor): }, } + @classmethod + def _extract_urls(cls, webpage): + return re.findall( + r']+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index c634b8dec..2182d6fd4 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, float_or_none, + unescapeHTML, ) @@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + match = re.search( + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return unescapeHTML(match.group('url')) + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) + if match: + return 'wistia:%s' % match.group('id') + + match = re.search( + r'''(?sx) + ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674a0..bea9b87ad 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + js_to_json, orderedSet, parse_duration, sanitized_Request, @@ -37,6 +38,22 @@ class XTubeIE(InfoExtractor): 'comment_count': int, 'age_limit': 18, } + }, { + # FLV videos with duplicated formats + 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', + 'md5': 'a406963eb349dd43692ec54631efd88b', + 'info_dict': { + 'id': '9299752', + 'display_id': 'A-Super-Run-Part-1-YT', + 'ext': 'flv', + 'title': 'A Super Run - Part 1 (YT)', + 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'uploader': 'tshirtguy59', + 'duration': 579, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + }, }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', @@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor): }) sources = self._parse_json(self._search_regex( - r'(["\'])sources\1\s*:\s*(?P{.+?}),', - webpage, 'sources', group='sources'), video_id) + r'(["\'])?sources\1?\s*:\s*(?P{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) title = self._search_regex( diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 30825daae..eca603028 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -6,8 +6,10 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, - ExtractorError, determine_ext, + ExtractorError, + int_or_none, + parse_duration, ) @@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor): 'id': '4588838', 'ext': 'mp4', 'title': 'Biker Takes his Girl', + 'duration': 108, 'age_limit': 18, } } @@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor): r'(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + video_duration = int_or_none(self._og_search_property( + 'duration', webpage, default=None)) or parse_duration( + self._search_regex( + r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', + webpage, 'duration', fatal=False)) formats = [] @@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor): 'id': video_id, 'formats': formats, 'title': video_title, + 'duration': video_duration, 'thumbnail': video_thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index fd6268ba4..eb1062142 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + tracks = playlist['tracks'] + track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2b9115c..d66693c0c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -38,7 +38,6 @@ from ..utils import ( parse_duration, remove_quotes, remove_start, - sanitized_Request, smuggle_url, str_to_int, try_get, @@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) - login_form.update({ - 'checkConnection': 'youtube', - 'Email': username, - 'Passwd': password, - }) + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_results = self._download_webpage( - self._PASSWORD_CHALLENGE_URL, None, - note='Logging in', errnote='unable to log in', fatal=False, - data=urlencode_postdata(login_form)) - if login_results is False: - return False + def warn(message): + self._downloader.report_warning(message) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] - error_msg = self._html_search_regex( - r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', - login_results, 'error message', default=None) - if error_msg: - raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - tfa_code = remove_start(tfa_code, 'G-') + if challenge_results is False: + return - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_data = urlencode_postdata(tfa_form_strs) + tfa = try_get(res, lambda x: x[0][0], list) + if tfa: + tfa_str = try_get(tfa, lambda x: x[2], compat_str) + if tfa_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(tfa, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True def _real_initialize(self): @@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1257,6 +1338,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1325,6 +1435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: age_gate = False video_info = None + sts = None # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: @@ -1341,6 +1452,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True + sts = ytplayer_config.get('sts') if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1349,14 +1461,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # The general idea is to take a union of itags of both DASH manifests (for example # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) + for el in ('info', 'embedded', 'detailpage', 'vevo', ''): + query = { + 'video_id': video_id, + 'ps': 'default', + 'eurl': '', + 'gl': 'US', + 'hl': 'en', + } + if el: + query['el'] = el + if sts: + query['sts'] = sts video_info_webpage = self._download_webpage( - video_info_url, + '%s://www.youtube.com/get_video_info' % proto, video_id, note=False, - errnote='unable to download video info webpage') + errnote='unable to download video info webpage', + fatal=False, query=query) + if not video_info_webpage: + continue get_video_info = compat_parse_qs(video_info_webpage) if get_video_info.get('use_cipher_signature') != ['True']: add_dash_mpd(get_video_info) @@ -1399,9 +1522,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ @@ -1558,6 +1681,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) + chapters = self._extract_chapters(description_original, video_duration) + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() formats = [{ @@ -1629,7 +1754,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1789,6 +1915,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py new file mode 100644 index 000000000..889aff5d8 --- /dev/null +++ b/youtube_dl/extractor/zaq1.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://zaq1.pl/video/xev0e', + 'md5': '24a5eb3f052e604ae597c4d0d19b351e', + 'info_dict': { + 'id': 'xev0e', + 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', + 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', + 'ext': 'mp4', + 'duration': 511, + 'timestamp': 1490896361, + 'uploader': 'Anonim', + 'upload_date': '20170330', + 'view_count': int, + } + }, { + # malformed JSON-LD + 'url': 'http://zaq1.pl/video/x81vn', + 'info_dict': { + 'id': 'x81vn', + 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', + 'ext': 'mp4', + 'duration': 6234, + 'timestamp': 1493494860, + 'uploader': 'Anonim', + 'upload_date': '20170429', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + + def extract_data(field, name, fatal=False): + return self._search_regex( + r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, + webpage, field, fatal=fatal, group='field') + + if not info.get('title'): + info['title'] = extract_data('file-name', 'title', fatal=True) + + if not info.get('duration'): + info['duration'] = int_or_none(extract_data('duration', 'duration')) + + if not info.get('thumbnail'): + info['thumbnail'] = extract_data('photo-url', 'thumbnail') + + if not info.get('timestamp'): + info['timestamp'] = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + + if not info.get('interactionCount'): + info['view_count'] = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + uploader = self._html_search_regex( + r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', + fatal=False) + + width = int_or_none(self._html_search_meta( + 'width', webpage, fatal=False)) + height = int_or_none(self._html_search_meta( + 'height', webpage, fatal=False)) + + info.update({ + 'id': video_id, + 'formats': [{ + 'url': video_url, + 'width': width, + 'height': height, + 'http_headers': { + 'Referer': url, + }, + }], + 'uploader': uploader, + }) + + return info diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index 24cdec28c..7bda59610 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -6,6 +6,7 @@ import re from .utils import ( ExtractorError, + remove_quotes, ) _OPERATORS = [ @@ -57,7 +58,6 @@ class JSInterpreter(object): def interpret_expression(self, expr, local_vars, allow_recursion): expr = expr.strip() - if expr == '': # Empty expression return None @@ -121,11 +121,19 @@ class JSInterpreter(object): pass m = re.match( - r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, + r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + m = re.match( + r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, expr) if m: variable = m.group('var') - member = m.group('member') + member = remove_quotes(m.group('member') or m.group('member2')) arg_str = m.group('args') if variable in local_vars: @@ -173,14 +181,6 @@ class JSInterpreter(object): return obj[member](argvals) - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: - val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) - return val[idx] - for op, opfunc in _OPERATORS: m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) if not m: @@ -211,21 +211,25 @@ class JSInterpreter(object): raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): + _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' obj = {} obj_m = re.search( - (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) + - r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' + - r'\}\s*;', + r'''(?x) + (?<!this\.)%s\s*=\s*{\s* + (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*) + }\s*; + ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( - r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function' - r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + r'''(?x) + (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} + ''' % _FUNC_NAME_RE, fields) for f in fields_m: argnames = f.group('args').split(',') - obj[f.group('key')] = self.build_function(argnames, f.group('code')) + obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code')) return obj diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 52309fb84..3021a6f41 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -814,11 +814,12 @@ def parseOpts(overrideArguments=None): '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', help='Parse additional metadata like song title / artist from the video title. ' - 'The format syntax is the same as --output, ' - 'the parsed parameters replace existing values. ' - 'Additional templates: %(album)s, %(artist)s. ' + 'The format syntax is the same as --output. Regular expression with ' + 'named capture groups may also be used. ' + 'The parsed parameters replace existing values. ' 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' - '"Coldplay - Paradise"') + '"Coldplay - Paradise". ' + 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"') postproc.add_option( '--xattrs', action='store_true', dest='xattrs', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 665109558..c91ec8588 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -4,6 +4,7 @@ import io import os import subprocess import time +import re from .common import AudioConversionError, PostProcessor @@ -22,6 +23,7 @@ from ..utils import ( subtitles_filename, dfxp2srt, ISO639Utils, + replace_extension, ) @@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') + in_filenames = [filename] + options = [] if info['ext'] == 'm4a': - options = ['-vn', '-acodec', 'copy'] + options.extend(['-vn', '-acodec', 'copy']) else: - options = ['-c', 'copy'] + options.extend(['-c', 'copy']) for (name, value) in metadata.items(): options.extend(['-metadata', '%s=%s' % (name, value)]) + chapters = info.get('chapters', []) + if chapters: + metadata_filename = encodeFilename(replace_extension(filename, 'meta')) + with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + def ffmpeg_escape(text): + return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text) + + metadata_file_content = ';FFMETADATA1\n' + for chapter in chapters: + metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n' + metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000) + metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000) + chapter_title = chapter.get('title') + if chapter_title: + metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title) + f.write(metadata_file_content) + in_filenames.append(metadata_filename) + options.extend(['-map_metadata', '1']) + self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename) - self.run_ffmpeg(filename, temp_filename, options) + self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options) + if chapters: + os.remove(metadata_filename) os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return [], info diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index a7d637a3c..c73f02447 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor): def __init__(self, downloader, titleformat): super(MetadataFromTitlePP, self).__init__(downloader) self._titleformat = titleformat - self._titleregex = self.format_to_regex(titleformat) + self._titleregex = (self.format_to_regex(titleformat) + if re.search(r'%\(\w+\)s', titleformat) + else titleformat) def format_to_regex(self, fmt): r""" diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 91e235ff2..4293a77f5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -11,6 +11,7 @@ import contextlib import ctypes import datetime import email.utils +import email.header import errno import functools import gzip @@ -421,8 +422,8 @@ def clean_html(html): # Newline vs <br /> html = html.replace('\n', ' ') - html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True): # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}): return new_req +def _multipart_encode_impl(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, compat_str): + k = k.encode('utf-8') + if isinstance(v, compat_str): + v = v.encode('utf-8') + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type + + +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If specified a Unicode object, it's used as the boundary. Otherwise + a random boundary is generated. + + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = _multipart_encode_impl(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: @@ -2273,10 +2331,8 @@ def mimetype2ext(mt): return { '3gpp': '3gp', 'smptett+xml': 'tt', - 'srt': 'srt', 'ttaf+xml': 'dfxp', 'ttml+xml': 'ttml', - 'vtt': 'vtt', 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 'wmv', @@ -2284,11 +2340,11 @@ def mimetype2ext(mt): 'x-mpegurl': 'm3u8', 'vnd.apple.mpegurl': 'm3u8', 'dash+xml': 'mpd', - 'f4m': 'f4m', 'f4m+xml': 'f4m', 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', 'quicktime': 'mov', + 'mp2t': 'ts', }.get(res, res) @@ -2304,11 +2360,11 @@ def parse_codecs(codecs_str): if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): if not vcodec: vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'): + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): if not acodec: acodec = full_codec else: - write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) if not vcodec and not acodec: if len(splited_codecs) == 2: return { @@ -3757,3 +3813,11 @@ def write_xattr(path, key, value): "Couldn't find a tool to set the xattrs. " "Install either the python 'xattr' module, " "or the 'xattr' binary.") + + +def random_birthday(year_field, month_field, day_field): + return { + year_field: str(random.randint(1950, 1995)), + month_field: str(random.randint(1, 12)), + day_field: str(random.randint(1, 31)), + } diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e206501e1..c006eba76 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.04.26' +__version__ = '2017.05.23'