Merge pull request #12909 from remitamine/raw-sub
authorYen Chi Hsuan <yan12125@gmail.com>
Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
committerGitHub <noreply@github.com>
Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
[YoutubeDL] write raw subtitle files

239 files changed:
.github/ISSUE_TEMPLATE.md
.github/ISSUE_TEMPLATE_tmpl.md
AUTHORS
CONTRIBUTING.md
ChangeLog
Makefile
README.md
devscripts/prepare_manpage.py
docs/supportedsites.md
test/test_InfoExtractor.py
test/test_YoutubeDL.py
test/test_options.py [new file with mode: 0644]
test/test_utils.py
test/test_youtube_chapters.py [new file with mode: 0644]
test/testdata/mpd/float_duration.mpd [new file with mode: 0644]
youtube_dl/YoutubeDL.py
youtube_dl/compat.py
youtube_dl/downloader/common.py
youtube_dl/downloader/dash.py
youtube_dl/downloader/external.py
youtube_dl/downloader/fragment.py
youtube_dl/downloader/hls.py
youtube_dl/downloader/http.py
youtube_dl/downloader/ism.py
youtube_dl/extractor/abc.py
youtube_dl/extractor/abcnews.py
youtube_dl/extractor/abcotvs.py
youtube_dl/extractor/adn.py
youtube_dl/extractor/adobepass.py
youtube_dl/extractor/adultswim.py
youtube_dl/extractor/aliexpress.py [new file with mode: 0644]
youtube_dl/extractor/aljazeera.py
youtube_dl/extractor/amcnetworks.py
youtube_dl/extractor/amp.py
youtube_dl/extractor/animeondemand.py
youtube_dl/extractor/anvato.py
youtube_dl/extractor/aparat.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/ard.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/asiancrush.py [new file with mode: 0644]
youtube_dl/extractor/audioboom.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/bbc.py
youtube_dl/extractor/beampro.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/bpb.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/buzzfeed.py
youtube_dl/extractor/cbc.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/cbsinteractive.py
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/cda.py
youtube_dl/extractor/charlierose.py
youtube_dl/extractor/chilloutzone.py
youtube_dl/extractor/cinchcast.py
youtube_dl/extractor/cjsw.py [new file with mode: 0644]
youtube_dl/extractor/clipfish.py [deleted file]
youtube_dl/extractor/clippit.py [new file with mode: 0644]
youtube_dl/extractor/cloudy.py
youtube_dl/extractor/common.py
youtube_dl/extractor/condenast.py
youtube_dl/extractor/corus.py
youtube_dl/extractor/coub.py
youtube_dl/extractor/cracked.py
youtube_dl/extractor/crackle.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/dailymail.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/democracynow.py
youtube_dl/extractor/disney.py
youtube_dl/extractor/dispeak.py
youtube_dl/extractor/dotsub.py
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dplay.py
youtube_dl/extractor/dramafever.py
youtube_dl/extractor/drbonanza.py
youtube_dl/extractor/drtuber.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/dvtv.py
youtube_dl/extractor/eagleplatform.py
youtube_dl/extractor/egghead.py
youtube_dl/extractor/espn.py
youtube_dl/extractor/extractors.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/firsttv.py
youtube_dl/extractor/fivetv.py
youtube_dl/extractor/flickr.py
youtube_dl/extractor/fourtube.py
youtube_dl/extractor/fox.py
youtube_dl/extractor/foxgay.py
youtube_dl/extractor/foxsports.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/funimation.py
youtube_dl/extractor/funnyordie.py
youtube_dl/extractor/gaskrank.py
youtube_dl/extractor/gdcvault.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/gfycat.py
youtube_dl/extractor/giantbomb.py
youtube_dl/extractor/godtv.py [deleted file]
youtube_dl/extractor/golem.py
youtube_dl/extractor/googledrive.py
youtube_dl/extractor/hgtv.py
youtube_dl/extractor/hitbox.py
youtube_dl/extractor/ign.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/itv.py
youtube_dl/extractor/joj.py [new file with mode: 0755]
youtube_dl/extractor/jove.py
youtube_dl/extractor/kaltura.py
youtube_dl/extractor/karrierevideos.py
youtube_dl/extractor/laola1tv.py
youtube_dl/extractor/leeco.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/manyvids.py [new file with mode: 0644]
youtube_dl/extractor/medialaan.py
youtube_dl/extractor/mediaset.py [new file with mode: 0644]
youtube_dl/extractor/megaphone.py [new file with mode: 0644]
youtube_dl/extractor/mitele.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mlb.py
youtube_dl/extractor/mpora.py [deleted file]
youtube_dl/extractor/msn.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/myspace.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/newgrounds.py
youtube_dl/extractor/nexx.py [new file with mode: 0644]
youtube_dl/extractor/nick.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/njpwworld.py
youtube_dl/extractor/nonktube.py [new file with mode: 0644]
youtube_dl/extractor/noovo.py [new file with mode: 0644]
youtube_dl/extractor/npo.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/nuevo.py
youtube_dl/extractor/onet.py
youtube_dl/extractor/ooyala.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/packtpub.py
youtube_dl/extractor/pandatv.py
youtube_dl/extractor/pandoratv.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/pearvideo.py [new file with mode: 0644]
youtube_dl/extractor/periscope.py
youtube_dl/extractor/pluralsight.py
youtube_dl/extractor/podomatic.py
youtube_dl/extractor/polskieradio.py
youtube_dl/extractor/pornhd.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/qqmusic.py
youtube_dl/extractor/radiocanada.py
youtube_dl/extractor/rai.py
youtube_dl/extractor/redbulltv.py
youtube_dl/extractor/reddit.py [new file with mode: 0644]
youtube_dl/extractor/redtube.py
youtube_dl/extractor/rmcdecouverte.py
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rutube.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/ruv.py [new file with mode: 0644]
youtube_dl/extractor/safari.py
youtube_dl/extractor/sexu.py
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/sohu.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/spiegeltv.py
youtube_dl/extractor/sportbox.py
youtube_dl/extractor/streamango.py
youtube_dl/extractor/streamcz.py
youtube_dl/extractor/svt.py
youtube_dl/extractor/tastytrade.py [new file with mode: 0644]
youtube_dl/extractor/tbs.py
youtube_dl/extractor/teamfourstar.py [deleted file]
youtube_dl/extractor/ted.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/thescene.py
youtube_dl/extractor/thisoldhouse.py
youtube_dl/extractor/toggle.py
youtube_dl/extractor/toutv.py
youtube_dl/extractor/toypics.py
youtube_dl/extractor/tudou.py
youtube_dl/extractor/turbo.py
youtube_dl/extractor/turner.py
youtube_dl/extractor/tvplayer.py
youtube_dl/extractor/twentymin.py
youtube_dl/extractor/twitter.py
youtube_dl/extractor/udemy.py
youtube_dl/extractor/upskill.py [new file with mode: 0644]
youtube_dl/extractor/veoh.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/vh1.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/viceland.py
youtube_dl/extractor/videopress.py
youtube_dl/extractor/vidio.py
youtube_dl/extractor/vidme.py
youtube_dl/extractor/vier.py
youtube_dl/extractor/viidea.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/viu.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vlive.py
youtube_dl/extractor/voot.py [new file with mode: 0644]
youtube_dl/extractor/vrv.py
youtube_dl/extractor/vzaar.py
youtube_dl/extractor/washingtonpost.py
youtube_dl/extractor/watchbox.py [new file with mode: 0644]
youtube_dl/extractor/watchindianporn.py
youtube_dl/extractor/wistia.py
youtube_dl/extractor/wsj.py
youtube_dl/extractor/xfileshare.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/xxxymovies.py
youtube_dl/extractor/yam.py [deleted file]
youtube_dl/extractor/yandexdisk.py [new file with mode: 0644]
youtube_dl/extractor/yandexmusic.py
youtube_dl/extractor/youjizz.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zaq1.py [new file with mode: 0644]
youtube_dl/jsinterp.py
youtube_dl/options.py
youtube_dl/postprocessor/execafterdownload.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/postprocessor/metadatafromtitle.py
youtube_dl/utils.py
youtube_dl/version.py

index a84cbc8010a64d79da3bd0efcceba6f9baaecb41..f40cb2c4e9c2e42015b057ad3d8fe931514f7439 100644 (file)
@@ -1,16 +1,16 @@
 ## Please follow the guide below
 
 - You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly
-- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x])
-- Use *Preview* tab to see how your issue will actually look like
+- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`)
+- Use the *Preview* tab to see what your issue will actually look like
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.28**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11**
 
 ### Before submitting an *issue* make sure you have:
-- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
+- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
 - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones
 
 ### What is the purpose of your *issue*?
 
 ### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows:
 
-Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl -v <your command line>`), copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+
 ```
-$ youtube-dl -v <your command line>
 [debug] System config: []
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2017.04.28
+[debug] youtube-dl version 2017.09.11
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
index df79503d3ec8fe02e76b6f2c529a60959037934e..26f61d3b43e85fb3b45ac09a7dd9be8c9c2e9b26 100644 (file)
@@ -1,16 +1,16 @@
 ## Please follow the guide below
 
 - You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly
-- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x])
-- Use *Preview* tab to see how your issue will actually look like
+- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`)
+- Use the *Preview* tab to see what your issue will actually look like
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
 - [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s**
 
 ### Before submitting an *issue* make sure you have:
-- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
+- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
 - [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones
 
 ### What is the purpose of your *issue*?
@@ -28,9 +28,9 @@
 
 ### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows:
 
-Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl -v <your command line>`), copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+
 ```
-$ youtube-dl -v <your command line>
 [debug] System config: []
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
diff --git a/AUTHORS b/AUTHORS
index 1bdb74285eeaf836145c1fff518bb1abff899101..478c7872f8cd7d8876b2b15033056ae896a2be68 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -212,3 +212,15 @@ Xiao Di Guan
 Thomas Winant
 Daniel Twardowski
 Jeremie Jarosh
+Gerard Rovira
+Marvin Ewald
+Frédéric Bournival
+Timendum
+gritstub
+Adam Voss
+Mike Fährmann
+Jan Kundrát
+Giuseppe Fabiano
+Örn Guðjónsson
+Parmjit Virk
+Genki Sky
index d606eab0edb0af07bf112f1e7f05e2f7668d5b8a..333acee8010e1799fe6cae92a42e08b71f207436 100644 (file)
@@ -3,7 +3,7 @@
 $ youtube-dl -v <your command line>
 [debug] System config: []
 [debug] User config: []
-[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
+[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
 [debug] youtube-dl version 2015.12.06
 [debug] Git HEAD: 135392e
@@ -34,7 +34,7 @@ For bug reports, this means that your report should contain the *complete* outpu
 
 If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/).
 
-**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL.
+**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL.
 
 ###  Are you using the latest version?
 
@@ -70,7 +70,7 @@ It may sound strange, but some bug reports we receive are completely unrelated t
 
 # DEVELOPER INSTRUCTIONS
 
-Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
+Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
 
 To run youtube-dl as a developer, you don't need to build anything either. Simply execute
 
@@ -82,6 +82,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file
     python test/test_download.py
     nosetests
 
+See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases.
+
 If you want to create a build of youtube-dl yourself, you'll need
 
 * python
@@ -118,7 +120,7 @@ After you have ensured this site is distributing its content legally, you can fo
     class YourExtractorIE(InfoExtractor):
         _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
         _TEST = {
-            'url': 'http://yourextractor.com/watch/42',
+            'url': 'https://yourextractor.com/watch/42',
             'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
             'info_dict': {
                 'id': '42',
@@ -149,10 +151,10 @@ After you have ensured this site is distributing its content legally, you can fo
             }
     ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in.
 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
index cd49ac42d4e8725340bddc679ebe73f4dfbb7e4b..c286da6c668979770a647327b060f586c54deb75 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,592 @@
+version 2017.09.11
+
+Extractors
+* [rutube:playlist] Fix suitable (#14166)
+
+
+version 2017.09.10
+
+Core
++ [utils] Introduce bool_or_none
+* [YoutubeDL] Ensure dir existence for each requested format (#14116)
+
+Extractors
+* [fox] Fix extraction (#14147)
+* [rutube] Use bool_or_none
+* [rutube] Rework and generalize playlist extractors (#13565)
++ [rutube:playlist] Add support for playlists (#13534, #13565)
++ [radiocanada] Add fallback for title extraction (#14145)
+* [vk] Use dedicated YouTube embeds extraction routine
+* [vice] Use dedicated YouTube embeds extraction routine
+* [cracked] Use dedicated YouTube embeds extraction routine
+* [chilloutzone] Use dedicated YouTube embeds extraction routine
+* [abcnews] Use dedicated YouTube embeds extraction routine
+* [youtube] Separate methods for embeds extraction
+* [redtube] Fix formats extraction (#14122)
+* [arte] Relax unavailability check (#14112)
++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059)
+* [vidme:user] Relax URL regular expression (#14054)
+* [bpb] Fix extraction (#14043, #14086)
+* [soundcloud] Fix download URL with private tracks (#14093)
+* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707)
+* [viidea] Capture and output lecture error message (#14099)
+* [radiocanada] Skip unsupported platforms (#14100)
+
+
+version 2017.09.02
+
+Extractors
+* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076,
+  #14077, #14079, #14082, #14083, #14094, #14095, #14096)
+* [youtube] Fix upload date extraction (#14065)
++ [charlierose] Add support for episodes (#14062)
++ [bbccouk] Add support for w-prefixed ids (#14056)
+* [googledrive] Extend URL regular expression (#9785)
++ [googledrive] Add support for source format (#14046)
+* [pornhd] Fix extraction (#14005)
+
+
+version 2017.08.27.1
+
+Extractors
+
+* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037)
+
+
+version 2017.08.27
+
+Core
++ [extractor/common] Extract height and format id for HTML5 videos (#14034)
+* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023,
+  #8625, #9483)
+    * Simplify code and split into separate routines to facilitate maintaining
+    * Make retry mechanism work on errors during actual download not only
+      during connection establishment phase
+    * Retry on ECONNRESET and ETIMEDOUT during reading data from network
+    * Retry on content too short
+    * Show error description on retry
+
+Extractors
+* [generic] Lower preference for extraction from LD-JSON
+* [rai] Fix audio formats extraction (#14024)
+* [youtube] Fix controversy videos extraction (#14027, #14029)
+* [mixcloud] Fix extraction (#14015, #14020)
+
+
+version 2017.08.23
+
+Core
++ [extractor/common] Introduce _parse_xml
+* [extractor/common] Make HLS and DASH extraction in_parse_html5_media_entries
+  non fatal (#13970)
+* [utils] Fix unescapeHTML for misformed string like "&a&quot;" (#13935)
+
+Extractors
+* [cbc:watch] Bypass geo restriction (#13993)
+* [toutv] Relax DRM check (#13994)
++ [googledrive] Add support for subtitles (#13619, #13638)
+* [pornhub] Relax uploader regular expression (#13906, #13975)
+* [bandcamp:album] Extract track titles (#13962)
++ [bbccouk] Add support for events URLs (#13893)
++ [liveleak] Support multi-video pages (#6542)
++ [liveleak] Support another liveleak embedding pattern (#13336)
+* [cda] Fix extraction (#13935)
++ [laola1tv] Add support for tv.ittf.com (#13965)
+* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003)
+
+
+version 2017.08.18
+
+Core
+* [YoutubeDL] Sanitize byte string format URLs (#13951)
++ [extractor/common] Add support for float durations in _parse_mpd_formats
+  (#13919)
+
+Extractors
+* [arte] Detect unavailable videos (#13945)
+* [generic] Convert redirect URLs to unicode strings (#13951)
+* [udemy] Fix paid course detection (#13943)
+* [pluralsight] Use RPC API for course extraction (#13937)
++ [clippit] Add support for clippituser.tv
++ [qqmusic] Support new URL schemes (#13805)
+* [periscope] Renew HLS extraction (#13917)
+* [mixcloud] Extract decrypt key
+
+
+version 2017.08.13
+
+Core
+* [YoutubeDL] Make sure format id is not empty
+* [extractor/common] Make _family_friendly_search optional
+* [extractor/common] Respect source's type attribute for HTML5 media (#13892)
+
+Extractors
+* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902)
++ [fourtube] Add support pornerbros.com (#6022)
++ [fourtube] Add support porntube.com (#7859, #13901)
++ [fourtube] Add support fux.com
+* [limelight] Improve embeds detection (#13895)
++ [reddit] Add support for v.redd.it and reddit.com (#13847)
+* [aparat] Extract all formats (#13887)
+* [mixcloud] Fix play info decryption (#13885)
++ [generic] Add support for vzaar embeds (#13876)
+
+
+version 2017.08.09
+
+Core
+* [utils] Skip missing params in cli_bool_option (#13865)
+
+Extractors
+* [xxxymovies] Fix title extraction (#13868)
++ [nick] Add support for nick.com.pl (#13860)
+* [mixcloud] Fix play info decryption (#13867)
+* [20min] Fix embeds extraction (#13852)
+* [dplayit] Fix extraction (#13851)
++ [niconico] Support videos with multiple formats (#13522)
++ [niconico] Support HTML5-only videos (#13806)
+
+
+version 2017.08.06
+
+Core
+* Use relative paths for DASH fragments (#12990)
+
+Extractors
+* [pluralsight] Fix format selection
+- [mpora] Remove extractor (#13826)
++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218)
+* [vlive:channel] Limit number of videos per page to 100 (#13830)
+* [podomatic] Extend URL regular expression (#13827)
+* [cinchcast] Extend URL regular expression
+* [yandexdisk] Relax URL regular expression (#13824)
+* [vidme] Extract DASH and HLS formats
+- [teamfour] Remove extractor (#13782)
+* [pornhd] Fix extraction (#13783)
+* [udemy] Fix subtitles extraction (#13812)
+* [mlb] Extend URL regular expression (#13740, #13773)
++ [pbs] Add support for new URL schema (#13801)
+* [nrktv] Update API host (#13796)
+
+
+version 2017.07.30.1
+
+Core
+* [downloader/hls] Use redirect URL as manifest base (#13755)
+* [options] Correctly hide login info from debug outputs (#13696)
+
+Extractors
++ [watchbox] Add support for watchbox.de (#13739)
+- [clipfish] Remove extractor
++ [youjizz] Fix extraction (#13744)
++ [generic] Add support for another ooyala embed pattern (#13727)
++ [ard] Add support for lives (#13771)
+* [soundcloud] Update client id
++ [soundcloud:trackstation] Add support for track stations (#13733)
+* [svtplay] Use geo verification proxy for API request
+* [svtplay] Update API URL (#13767)
++ [yandexdisk] Add support for yadi.sk (#13755)
++ [megaphone] Add support for megaphone.fm
+* [amcnetworks] Make rating optional (#12453)
+* [cloudy] Fix extraction (#13737)
++ [nickru] Add support for nickelodeon.ru
+* [mtv] Improve thumbnal extraction
+* [nick] Automate geo-restriction bypass (#13711)
+* [niconico] Improve error reporting (#13696)
+
+
+version 2017.07.23
+
+Core
+* [YoutubeDL] Improve default format specification (#13704)
+* [YoutubeDL] Do not override id, extractor and extractor_key for
+  url_transparent entities
+* [extractor/common] Fix playlist_from_matches
+
+Extractors
+* [itv] Fix production id extraction (#13671, #13703)
+* [vidio] Make duration non fatal and fix typo
+* [mtv] Skip missing video parts (#13690)
+* [sportbox:embed] Fix extraction
++ [npo] Add support for npo3.nl URLs (#13695)
+* [dramafever] Remove video id from title (#13699)
++ [egghead:lesson] Add support for lessons (#6635)
+* [funnyordie] Extract more metadata (#13677)
+* [youku:show] Fix playlist extraction (#13248)
++ [dispeak] Recognize sevt subdomain (#13276)
+* [adn] Improve error reporting (#13663)
+* [crunchyroll] Relax series and season regular expression (#13659)
++ [spiegel:article] Add support for nexx iframe embeds (#13029)
++ [nexx:embed] Add support for iframe embeds
+* [nexx] Improve JS embed extraction
++ [pearvideo] Add support for pearvideo.com (#13031)
+
+
+version 2017.07.15
+
+Core
+* [YoutubeDL] Don't expand environment variables in meta fields (#13637)
+
+Extractors
+* [spiegeltv] Delegate extraction to nexx extractor (#13159)
++ [nexx] Add support for nexx.cloud (#10807, #13465)
+* [generic] Fix rutube embeds extraction (#13641)
+* [karrierevideos] Fix title extraction (#13641)
+* [youtube] Don't capture YouTube Red ad for creator meta field (#13621)
+* [slideshare] Fix extraction (#13617)
++ [5tv] Add another video URL pattern (#13354, #13606)
+* [drtv] Make HLS and HDS extraction non fatal
+* [ted] Fix subtitles extraction (#13628, #13629)
+* [vine] Make sure the title won't be empty
++ [twitter] Support HLS streams in vmap URLs
++ [periscope] Support pscp.tv URLs in embedded frames
+* [twitter] Extract mp4 urls via mobile API (#12726)
+* [niconico] Fix authentication error handling (#12486)
+* [giantbomb] Extract m3u8 formats (#13626)
++ [vlive:playlist] Add support for playlists (#13613)
+
+
+version 2017.07.09
+
+Core
++ [extractor/common] Add support for AMP tags in _parse_html5_media_entries
++ [utils] Support attributes with no values in get_elements_by_attribute
+
+Extractors
++ [dailymail] Add support for embeds
++ [joj] Add support for joj.sk (#13268)
+* [abc.net.au:iview] Extract more formats (#13492, #13489)
+* [egghead:course] Fix extraction (#6635, #13370)
++ [cjsw] Add support for cjsw.com (#13525)
++ [eagleplatform] Add support for referrer protected videos (#13557)
++ [eagleplatform] Add support for another embed pattern (#13557)
+* [veoh] Extend URL regular expression (#13601)
+* [npo:live] Fix live stream id extraction (#13568, #13605)
+* [googledrive] Fix height extraction (#13603)
++ [dailymotion] Add support for new layout (#13580)
+- [yam] Remove extractor
+* [xhamster] Extract all formats and fix duration extraction (#13593)
++ [xhamster] Add support for new URL schema (#13593)
+* [espn] Extend URL regular expression (#13244, #13549)
+* [kaltura] Fix typo in subtitles extraction (#13569)
+* [vier] Adapt extraction to redesign (#13575)
+
+
+version 2017.07.02
+
+Core
+* [extractor/common] Improve _json_ld
+
+Extractors
++ [thisoldhouse] Add more fallbacks for video id
+* [thisoldhouse] Fix video id extraction (#13540, #13541)
+* [xfileshare] Extend format regular expression (#13536)
+* [ted] Fix extraction (#13535)
++ [tastytrade] Add support for tastytrade.com (#13521)
+* [dplayit] Relax video id regular expression (#13524)
++ [generic] Extract more generic metadata (#13527)
++ [bbccouk] Capture and output error message (#13501, #13518)
+* [cbsnews] Relax video info regular expression (#13284, #13503)
++ [facebook] Add support for plugin video embeds and multiple embeds (#13493)
+* [soundcloud] Switch to https for API requests (#13502)
+* [pandatv] Switch to https for API and download URLs
++ [pandatv] Add support for https URLs (#13491)
++ [niconico] Support sp subdomain (#13494)
+
+
+version 2017.06.25
+
+Core
++ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472)
+* [YoutubeDL] Skip malformed formats for better extraction robustness
+
+Extractors
++ [wsj] Add support for barrons.com (#13470)
++ [ign] Add another video id pattern (#13328)
++ [raiplay:live] Add support for live streams (#13414)
++ [redbulltv] Add support for live videos and segments (#13486)
++ [onetpl] Add support for videos embedded via pulsembed (#13482)
+* [ooyala] Make more robust
+* [ooyala] Skip empty format URLs (#13471, #13476)
+* [hgtv.com:show] Fix typo
+
+
+version 2017.06.23
+
+Core
+* [adobepass] Fix extraction on older python 2.6
+
+Extractors
+* [youtube] Adapt to new automatic captions rendition (#13467)
+* [hgtv.com:show] Relax video config regular expression (#13279, #13461)
+* [drtuber] Fix formats extraction (#12058)
+* [youporn] Fix upload date extraction
+* [youporn] Improve formats extraction
+* [youporn] Fix title extraction (#13456)
+* [googledrive] Fix formats sorting (#13443)
+* [watchindianporn] Fix extraction (#13411, #13415)
++ [vimeo] Add fallback mp4 extension for original format
++ [ruv] Add support for ruv.is (#13396)
+* [viu] Fix extraction on older python 2.6
+* [pandora.tv] Fix upload_date extraction (#12846)
++ [asiancrush] Add support for asiancrush.com (#13420)
+
+
+version 2017.06.18
+
+Core
+* [downloader/common] Use utils.shell_quote for debug command line
+* [utils] Use compat_shlex_quote in shell_quote
+* [postprocessor/execafterdownload] Encode command line (#13407)
+* [compat] Fix compat_shlex_quote on Windows (#5889, #10254)
+* [postprocessor/metadatafromtitle] Fix missing optional meta fields processing
+   in --metadata-from-title (#13408)
+* [extractor/common] Fix json dumping with --geo-bypass
++ [extractor/common] Improve jwplayer subtitles extraction
++ [extractor/common] Improve jwplayer formats extraction (#13379)
+
+Extractors
+* [polskieradio] Fix extraction (#13392)
++ [xfileshare] Add support for fastvideo.me (#13385)
+* [bilibili] Fix extraction of videos with double quotes in titles (#13387)
+* [4tube] Fix extraction (#13381, #13382)
++ [disney] Add support for disneychannel.de (#13383)
+* [npo] Improve URL regular expression (#13376)
++ [corus] Add support for showcase.ca
++ [corus] Add support for history.ca (#13359)
+
+
+version 2017.06.12
+
+Core
+* [utils] Handle compat_HTMLParseError in extract_attributes (#13349)
++ [compat] Introduce compat_HTMLParseError
+* [utils] Improve unified_timestamp
+* [extractor/generic] Ensure format id is unicode string
+* [extractor/common] Return unicode string from _match_id
++ [YoutubeDL] Sanitize more fields (#13313)
+
+Extractors
++ [xfileshare] Add support for rapidvideo.tv (#13348)
+* [xfileshare] Modernize and pass Referer
++ [rutv] Add support for testplayer.vgtrk.com (#13347)
++ [newgrounds] Extract more metadata (#13232)
++ [newgrounds:playlist] Add support for playlists (#10611)
+* [newgrounds] Improve formats and uploader extraction (#13346)
+* [msn] Fix formats extraction
+* [turbo] Ensure format id is string
+* [sexu] Ensure height is int
+* [jove] Ensure comment count is int
+* [golem] Ensure format id is string
+* [gfycat] Ensure filesize is int
+* [foxgay] Ensure height is int
+* [flickr] Ensure format id is string
+* [sohu] Fix numeric fields
+* [safari] Improve authentication detection (#13319)
+* [liveleak] Ensure height is int (#13313)
+* [streamango] Make title optional (#13292)
+* [rtlnl] Improve URL regular expression (#13295)
+* [tvplayer] Fix extraction (#13291)
+
+
+version 2017.06.05
+
+Core
+* [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270)
+
+Extractors
++ [bandcamp:weekly] Add support for bandcamp weekly (#12758)
+* [pornhub:playlist] Fix extraction (#13281)
+- [godtv] Remove extractor (#13175)
+* [safari] Fix typo (#13252)
+* [youtube] Improve chapters extraction (#13247)
+* [1tv] Lower preference for HTTP formats (#13246)
+* [francetv] Relax URL regular expression
+* [drbonanza] Fix extraction (#13231)
+* [packtpub] Fix authentication (#13240)
+
+
+version 2017.05.29
+
+Extractors
+* [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs
+  (#13211)
+* [xhamster] Fix uploader and like/dislike count extraction (#13216))
++ [xhamster] Extract categories (#11728)
++ [abcnews] Add support for embed URLs (#12851)
+* [gaskrank] Fix extraction (#12493)
+* [medialaan] Fix videos with missing videoUrl (#12774)
+* [dvtv] Fix playlist support
++ [dvtv] Add support for DASH and HLS formats (#3063)
++ [beam:vod] Add support for beam.pro/mixer.com VODs (#13032))
+* [cbsinteractive] Relax URL regular expression (#13213)
+* [adn] Fix formats extraction
++ [youku] Extract more metadata (#10433)
+* [cbsnews] Fix extraction (#13205)
+
+
+version 2017.05.26
+
+Core
++ [utils] strip_jsonp() can recognize more patterns
+* [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182)
+
+Extractors
++ [youtube] DASH MPDs with cipher signatures are recognized now (#11381)
++ [bbc] Add support for authentication
+* [tudou] Merge into youku extractor (#12214)
+* [youku:show] Fix extraction
+* [youku] Fix extraction (#13191)
+* [udemy] Fix extraction for outputs' format entries without URL (#13192)
+* [vimeo] Fix formats' sorting (#13189)
+* [cbsnews] Fix extraction for 60 Minutes videos (#12861)
+
+
+version 2017.05.23
+
+Core
++ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183)
++ [adobepass] Add support for Bright House Networks (#13149)
+
+Extractors
++ [streamcz] Add support for subtitles (#13174)
+* [youtube] Fix DASH manifest signature decryption (#8944, #13156)
+* [toggle] Relax URL regular expression (#13172)
+* [toypics] Fix extraction (#13077)
+* [njpwworld] Fix extraction (#13162, #13169)
++ [hitbox] Add support for smashcast.tv (#13154)
+* [mitele] Update app key regular expression (#13158)
+
+
+version 2017.05.18.1
+
+Core
+* [jsinterp] Fix typo and cleanup regular expressions (#13134)
+
+
+version 2017.05.18
+
+Core
++ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125,
+  #13126, #13128, #13129, #13130, #13131, #13132)
++ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats
+  (#13088, #13092)
++ [utils] Recognize more audio codecs (#13081)
+
+Extractors
++ [vier] Extract more metadata (#12539)
+* [vier] Improve extraction (#12801)
+    + Add support for authentication
+    * Bypass authentication when no credentials provided
+    * Improve extraction robustness
+* [dailymail] Fix sources extraction (#13057)
+* [dailymotion] Extend URL regular expression (#13079)
+
+
+version 2017.05.14
+
+Core
++ [extractor/common] Respect Width and Height attributes in ISM manifests
++ [postprocessor/metadatafromtitle] Add support regular expression syntax for
+  --metadata-from-title (#13065)
+
+Extractors
++ [mediaset] Add support for video.mediaset.it (#12708, #12964)
+* [orf:radio] Fix extraction (#11643, #12926)
+* [aljazeera] Extend URL regular expression (#13053)
+* [imdb] Relax URL regular expression (#13056)
++ [francetv] Add support for mobile.france.tv (#13068)
++ [upskill] Add support for upskillcourses.com (#13043)
+* [thescene] Fix extraction (#13061)
+* [condenast] Improve embed support
+* [liveleak] Fix extraction (#12053)
++ [douyu] Support Douyu shows (#12228)
+* [myspace] Improve URL regular expression (#13040)
+* [adultswim] Use desktop platform in assets URL (#13041)
+
+
+version 2017.05.09
+
+Core
+* [YoutubeDL] Force --restrict-filenames when no locale is set on all python
+  versions (#13027)
+
+Extractors
+* [francetv] Adapt to site redesign (#13034)
++ [packtpub] Add support for authentication (#12622)
+* [drtv] Lower preference for SignLanguage formats (#13013, #13016)
++ [cspan] Add support for brightcove live embeds (#13028)
+* [vrv] Extract DASH formats and subtitles
+* [funimation] Fix authentication (#13021)
+* [adultswim] Fix extraction (#8640, #10950, #11042, #12121)
+    + Add support for Adobe Pass authentication
+    + Add support for live streams
+    + Add support for show pages
+* [turner] Extract thumbnail, is_live and strip description
++ [nonktube] Add support for nonktube.com (#8647, #13024)
++ [nuevo] Pass headers to _extract_nuevo
+* [nbc] Improve extraction (#12364)
+
+
+version 2017.05.07
+
+Common
+* [extractor/common] Fix typo in _extract_akamai_formats
++ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata
++ [extractor/common] Introduce chapters meta field
+
+Extractors
+* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995,
+  #13003)
+* [bilibili] Fix video downloading (#13001)
+* [rmcdecouverte] Fix extraction (#12937)
+* [theplatform] Extract chapters
+* [bandcamp] Fix thumbnail extraction (#12980)
+* [pornhub] Extend URL regular expression (#12996)
++ [youtube] Extract chapters
++ [nrk] Extract chapters
++ [vice] Add support for ooyala embeds in article pages
++ [vice] Support vice articles (#12968)
+* [vice] Fix extraction for non en_us videos (#12967)
+* [gdcvault] Fix extraction for some videos (#12733)
+* [pbs] Improve multipart video support (#12981)
+* [laola1tv] Fix extraction (#12880)
++ [cda] Support birthday verification (#12789)
+* [leeco] Fix extraction (#12974)
++ [pbs] Extract chapters
+* [amp] Imporove thumbnail and subtitles extraction
+* [foxsports] Fix extraction (#12945)
+- [coub] Remove comment count extraction (#12941)
+
+
+version 2017.05.01
+
+Core
++ [extractor/common] Extract view count from JSON-LD
+* [utils] Improve unified_timestamp
++ [utils] Add video/mp2t to mimetype2ext
+* [downloader/external] Properly handle live stream downloading cancellation
+  (#8932)
++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906)
+
+Extractors
+* [infoq] Make audio format extraction non fatal (#12938)
+* [brightcove] Allow whitespace around attribute names in embedded code
++ [zaq1] Add support for zaq1.pl (#12693)
++ [xvideos] Extract duration (#12828)
+* [vevo] Fix extraction (#12879)
++ [noovo] Add support for noovo.ca (#12792)
++ [washingtonpost] Add support for embeds (#12699)
+* [yandexmusic:playlist] Fix extraction for python 3 (#12888)
+* [anvato] Improve extraction (#12913)
+    * Promote to regular shortcut based extractor
+    * Add mcp to access key mapping table
+    * Add support for embeds extraction
+    * Add support for anvato embeds in generic extractor
+* [xtube] Fix extraction for older FLV videos (#12734)
+* [tvplayer] Fix extraction (#12908)
+
+
 version 2017.04.28
 
 Core
@@ -24,19 +613,19 @@ Core
 * [YoutubeDL] Fix output template for missing timestamp (#12796)
 * [socks] Handle cases where credentials are required but missing
 * [extractor/common] Improve HLS extraction (#12211)
-    - Extract m3u8 parsing to separate method
-    - Improve rendition groups extraction
-    - Build stream name according stream GROUP-ID
-    - Ignore reference to AUDIO group without URI when stream has no CODECS
-    - Use float for scaled tbr in _parse_m3u8_formats
+    * Extract m3u8 parsing to separate method
+    * Improve rendition groups extraction
+    * Build stream name according stream GROUP-ID
+    * Ignore reference to AUDIO group without URI when stream has no CODECS
+    * Use float for scaled tbr in _parse_m3u8_formats
 * [utils] Add support for TTML styles in dfxp2srt
 * [downloader/hls] No need to download keys for fragments that have been
   already downloaded
 * [downloader/fragment] Improve fragment downloading
-    - Resume immediately
-    - Don't concatenate fragments and decrypt them on every resume
-    - Optimize disk storage usage, don't store intermediate fragments on disk
-    - Store bookkeeping download state file
+    * Resume immediately
+    * Don't concatenate fragments and decrypt them on every resume
+    * Optimize disk storage usage, don't store intermediate fragments on disk
+    * Store bookkeeping download state file
 + [extractor/common] Add support for multiple getters in try_get
 + [extractor/common] Add support for video of WebPage context in _json_ld
   (#12778)
index 023556391191a73fb122cea898cc8565ae66e92a..c74eea7922e9308d70b5c88a3d5c068e4e1f7826 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,15 @@ tar: youtube-dl.tar.gz
 pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish
 
 youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
-       zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py
-       zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py
+       mkdir -p zip
+       for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \
+         mkdir -p zip/$$d ;\
+         cp -pPR $$d/*.py zip/$$d/ ;\
+       done
+       touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py
+       mv zip/youtube_dl/__main__.py zip/
+       cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py
+       rm -rf zip
        echo '#!$(PYTHON)' > youtube-dl
        cat youtube-dl.zip >> youtube-dl
        rm youtube-dl.zip
@@ -101,7 +108,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
                --exclude '*.pyc' \
                --exclude '*.pyo' \
                --exclude '*~' \
-               --exclude '__pycache' \
+               --exclude '__pycache__' \
                --exclude '.git' \
                --exclude 'testdata' \
                --exclude 'docs/_build' \
index 1ecd2005ac3384160ebdcf5d2a34de1001f17b8d..28ee63f40aae01ddc9d2abfb001a125e3e2e1105 100644 (file)
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ If you do not have curl, you can alternatively use a recent wget:
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
     sudo chmod a+rx /usr/local/bin/youtube-dl
 
-Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`).
+Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](https://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`).
 
 You can also use pip:
 
@@ -33,7 +33,7 @@ You can also use pip:
     
 This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
 
-OS X users can install youtube-dl with [Homebrew](http://brew.sh/):
+OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
 
     brew install youtube-dl
 
@@ -145,18 +145,18 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
     --max-views COUNT                Do not download any videos with more than
                                      COUNT views
     --match-filter FILTER            Generic video filter. Specify any key (see
-                                     help for -o for a list of available keys)
-                                     to match if the key is present, !key to
-                                     check if the key is not present, key >
-                                     NUMBER (like "comment_count > 12", also
-                                     works with >=, <, <=, !=, =) to compare
-                                     against a number, key = 'LITERAL' (like
-                                     "uploader = 'Mike Smith'", also works with
-                                     !=) to match against a string literal and &
-                                     to require multiple matches. Values which
-                                     are not known are excluded unless you put a
-                                     question mark (?) after the operator. For
-                                     example, to only match videos that have
+                                     the "OUTPUT TEMPLATE" for a list of
+                                     available keys) to match if the key is
+                                     present, !key to check if the key is not
+                                     present, key > NUMBER (like "comment_count
+                                     > 12", also works with >=, <, <=, !=, =) to
+                                     compare against a number, key = 'LITERAL'
+                                     (like "uploader = 'Mike Smith'", also works
+                                     with !=) to match against a string literal
+                                     and & to require multiple matches. Values
+                                     which are not known are excluded unless you
+                                     put a question mark (?) after the operator.
+                                     For example, to only match videos that have
                                      been liked more than 100 times and disliked
                                      less than 50 times (or the dislike
                                      functionality is not available at the given
@@ -277,8 +277,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
     --get-filename                   Simulate, quiet but print output filename
     --get-format                     Simulate, quiet but print output format
     -j, --dump-json                  Simulate, quiet but print JSON information.
-                                     See --output for a description of available
-                                     keys.
+                                     See the "OUTPUT TEMPLATE" for a description
+                                     of available keys.
     -J, --dump-single-json           Simulate, quiet but print JSON information
                                      for each command-line argument. If the URL
                                      refers to a playlist, dump the whole
@@ -400,12 +400,14 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
     --add-metadata                   Write metadata to the video file
     --metadata-from-title FORMAT     Parse additional metadata like song title /
                                      artist from the video title. The format
-                                     syntax is the same as --output, the parsed
-                                     parameters replace existing values.
-                                     Additional templates: %(album)s,
-                                     %(artist)s. Example: --metadata-from-title
-                                     "%(artist)s - %(title)s" matches a title
-                                     like "Coldplay - Paradise"
+                                     syntax is the same as --output. Regular
+                                     expression with named capture groups may
+                                     also be used. The parsed parameters replace
+                                     existing values. Example: --metadata-from-
+                                     title "%(artist)s - %(title)s" matches a
+                                     title like "Coldplay - Paradise". Example
+                                     (regex): --metadata-from-title
+                                     "(?P<artist>.+?) - (?P<title>.+)"
     --xattrs                         Write metadata to the video file's xattrs
                                      (using dublin core and xdg standards)
     --fixup POLICY                   Automatically correct known faults of the
@@ -456,7 +458,7 @@ You can also use `--config-location` if you want to use custom configuration fil
 
 ### Authentication with `.netrc` file
 
-You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you:
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you:
 ```
 touch $HOME/.netrc
 chmod a-rwx,u+rw $HOME/.netrc
@@ -472,7 +474,10 @@ machine twitch login my_twitch_account_name password my_twitch_password
 ```
 To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration).
 
-On Windows you may also need to setup the `%HOME%` environment variable manually.
+On Windows you may also need to setup the `%HOME%` environment variable manually. For example:
+```
+set HOME=%USERPROFILE%
+```
 
 # OUTPUT TEMPLATE
 
@@ -480,7 +485,7 @@ The `-o` option allows users to indicate a template for the output file names.
 
 **tl;dr:** [navigate me to examples](#output-template-examples).
 
-The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are:
+The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are:
 
  - `id` (string): Video identifier
  - `title` (string): Video title
@@ -530,13 +535,14 @@ The basic usage is not to set any template arguments when downloading a single f
  - `playlist_id` (string): Playlist identifier
  - `playlist_title` (string): Playlist title
 
-
 Available for the video that belongs to some logical chapter or section:
+
  - `chapter` (string): Name or title of the chapter the video belongs to
  - `chapter_number` (numeric): Number of the chapter the video belongs to
  - `chapter_id` (string): Id of the chapter the video belongs to
 
 Available for the video that is an episode of some series or programme:
+
  - `series` (string): Title of the series or programme the video episode belongs to
  - `season` (string): Title of the season the video episode belongs to
  - `season_number` (numeric): Number of the season the video episode belongs to
@@ -546,6 +552,7 @@ Available for the video that is an episode of some series or programme:
  - `episode_id` (string): Id of the video episode
 
 Available for the media that is a track or a part of a music album:
+
  - `track` (string): Title of the track
  - `track_number` (numeric): Number of the track within an album or a disc
  - `track_id` (string): Id of the track
@@ -577,7 +584,7 @@ If you are using an output template inside a Windows batch file then you must es
 
 #### Output template examples
 
-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.
 
 ```bash
 $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc
@@ -596,7 +603,7 @@ $ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)
 $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/
 
 # Download entire series season keeping each series and each season in separate directory under C:/MyVideos
-$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617
+$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617
 
 # Stream the video being downloaded to stdout
 $ youtube-dl -o - BaW_jenozKc
@@ -647,7 +654,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin
  - `acodec`: Name of the audio codec in use
  - `vcodec`: Name of the video codec in use
  - `container`: Name of the container format
- - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `m3u8`, or `m3u8_native`)
+ - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`)
  - `format_id`: A short description of the format
 
 Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster.
@@ -664,7 +671,7 @@ If you want to preserve the old format selection behavior (prior to youtube-dl 2
 
 #### Format selection examples
 
-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.
 
 ```bash
 # Download best mp4 format available or any other best if no mp4 available
@@ -709,17 +716,17 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231
 
 ### How do I update youtube-dl?
 
-If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
+If you've followed [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
 
 If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update.
 
-If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
+If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
 
 As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like
 
     sudo apt-get remove -y youtube-dl
 
-Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html):
+Afterwards, simply follow [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html):
 
 ```
 sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl
@@ -759,11 +766,11 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much.
 
 youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option.
 
-Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed.
+Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](https://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed.
 
 ### I have downloaded a video but how can I play it?
 
-Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/).
+Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](https://www.videolan.org/) or [mplayer](https://www.mplayerhq.hu/).
 
 ### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser.
 
@@ -838,10 +845,10 @@ Use the `-o` to specify an [output template](#output-template), for example `-o
 
 ### How do I download a video starting with a `-`?
 
-Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`:
+Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the options with `--`:
 
     youtube-dl -- -wNyEUrxzFU
-    youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU"
+    youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU"
 
 ### How do I pass cookies to youtube-dl?
 
@@ -855,9 +862,9 @@ Passing cookies to youtube-dl is a good way to workaround login when a particula
 
 ### How do I stream directly to media player?
 
-You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with:
+You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](https://www.videolan.org/) can be achieved with:
 
-    youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc -
+    youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc -
 
 ### How do I download only new videos from a playlist?
 
@@ -877,7 +884,7 @@ When youtube-dl detects an HLS video, it can download it either with the built-i
 
 When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg.
 
-In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader.
+In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](https://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader.
 
 If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case.
 
@@ -903,7 +910,7 @@ Feel free to bump the issue from time to time by writing a small comment ("Issue
 
 ### How can I detect whether a given URL is supported by youtube-dl?
 
-For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug.
+For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from https://example.com/video/1234567 to https://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug.
 
 It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor.
 
@@ -917,7 +924,7 @@ youtube-dl is an open-source project manned by too few volunteers, so we'd rathe
 
 # DEVELOPER INSTRUCTIONS
 
-Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
+Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
 
 To run youtube-dl as a developer, you don't need to build anything either. Simply execute
 
@@ -929,6 +936,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file
     python test/test_download.py
     nosetests
 
+See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases.
+
 If you want to create a build of youtube-dl yourself, you'll need
 
 * python
@@ -965,7 +974,7 @@ After you have ensured this site is distributing its content legally, you can fo
     class YourExtractorIE(InfoExtractor):
         _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
         _TEST = {
-            'url': 'http://yourextractor.com/watch/42',
+            'url': 'https://yourextractor.com/watch/42',
             'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
             'info_dict': {
                 'id': '42',
@@ -996,10 +1005,10 @@ After you have ensured this site is distributing its content legally, you can fo
             }
     ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in.
 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
@@ -1155,7 +1164,7 @@ import youtube_dl
 
 ydl_opts = {}
 with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
 ```
 
 Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
@@ -1194,19 +1203,19 @@ ydl_opts = {
     'progress_hooks': [my_hook],
 }
 with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
 ```
 
 # BUGS
 
-Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
+Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
 
 **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
 ```
 $ youtube-dl -v <your command line>
 [debug] System config: []
 [debug] User config: []
-[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
+[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
 [debug] youtube-dl version 2015.12.06
 [debug] Git HEAD: 135392e
@@ -1237,7 +1246,7 @@ For bug reports, this means that your report should contain the *complete* outpu
 
 If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/).
 
-**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL.
+**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL.
 
 ###  Are you using the latest version?
 
index f9fe63f1ffd5073b312f22e8f08fb7798fa3f7a4..76bf873e1bd70b7e5c3a20caf2ea80f0941a2dea 100644 (file)
@@ -8,7 +8,7 @@ import re
 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 README_FILE = os.path.join(ROOT_DIR, 'README.md')
 
-PREFIX = '''%YOUTUBE-DL(1)
+PREFIX = r'''%YOUTUBE-DL(1)
 
 # NAME
 
index 0be7af8c402c554ecff7b2cf249591df046dc744..798a81d3c4bac705ea15ac77ec9a4064687df92d 100644 (file)
  - **afreecatv**: afreecatv.com
  - **afreecatv:global**: afreecatv.com
  - **AirMozilla**
+ - **AliExpressLive**
  - **AlJazeera**
  - **Allocine**
  - **AlphaPorno**
  - **AMCNetworks**
- - **anderetijden**: npo.nl and ntr.nl
+ - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **AnimeOnDemand**
  - **anitube.se**
+ - **Anvato**
  - **AnySex**
  - **Aparat**
  - **AppleConnect**
@@ -66,6 +68,8 @@
  - **arte.tv:info**
  - **arte.tv:magazine**
  - **arte.tv:playlist**
+ - **AsianCrush**
+ - **AsianCrushPlaylist**
  - **AtresPlayer**
  - **ATTTechChannel**
  - **ATVAt**
  - **bambuser:channel**
  - **Bandcamp**
  - **Bandcamp:album**
+ - **Bandcamp:weekly**
  - **bangumi.bilibili.com**: BiliBili番剧
  - **bbc**: BBC
  - **bbc.co.uk**: BBC iPlayer
  - **bbc.co.uk:article**: BBC articles
  - **bbc.co.uk:iplayer:playlist**
  - **bbc.co.uk:playlist**
- - **Beam:live**
  - **Beatport**
  - **Beeg**
  - **BehindKink**
  - **chirbit**
  - **chirbit:profile**
  - **Cinchcast**
- - **Clipfish**
+ - **CJSW**
  - **cliphunter**
+ - **Clippit**
  - **ClipRs**
  - **Clipsyndicate**
  - **CloserToTruth**
  - **DiscoveryVR**
  - **Disney**
  - **Dotsub**
+ - **DouyuShow**
  - **DouyuTV**: 斗鱼
  - **DPlay**
  - **DPlayIt**
  - **EbaumsWorld**
  - **EchoMsk**
  - **egghead:course**: egghead.io course
+ - **egghead:lesson**: egghead.io lesson
  - **eHow**
  - **Einthusan**
  - **eitb.tv**
  - **france2.fr:generation-quoi**
  - **FranceCulture**
  - **FranceInter**
- - **francetv**: France 2, 3, 4, 5 and Ô
+ - **FranceTV**
+ - **FranceTVEmbed**
  - **francetvinfo.fr**
  - **Freesound**
  - **freespeech.org**
  - **Funimation**
  - **FunnyOrDie**
  - **Fusion**
+ - **Fux**
  - **FXNetworks**
  - **GameInformer**
  - **GameOne**
  - **Go**
  - **Go90**
  - **GodTube**
- - **GodTV**
  - **Golem**
  - **GoogleDrive**
  - **Goshgay**
  - **IPrima**
  - **iqiyi**: 爱奇艺
  - **Ir90Tv**
+ - **ITTF**
  - **ITV**
  - **ivi**: ivi.ru
  - **ivi:compilation**: ivi.ru compilations
  - **Jamendo**
  - **JamendoAlbum**
  - **JeuxVideo**
+ - **Joj**
  - **Jove**
  - **jpopsuki.tv**
  - **JWPlatform**
  - **limelight:channel_list**
  - **LiTV**
  - **LiveLeak**
+ - **LiveLeakEmbed**
  - **livestream**
  - **livestream:original**
  - **LnkGo**
  - **MakerTV**
  - **mangomolo:live**
  - **mangomolo:video**
+ - **ManyVids**
  - **MatchTV**
  - **MDR**: MDR.DE and KiKA
  - **media.ccc.de**
  - **Medialaan**
+ - **Mediaset**
  - **Medici**
+ - **megaphone.fm**: megaphone.fm embedded players
  - **Meipai**: 美拍
  - **MelonVOD**
  - **META**
  - **mixcloud:playlist**
  - **mixcloud:stream**
  - **mixcloud:user**
+ - **Mixer:live**
+ - **Mixer:vod**
  - **MLB**
  - **Mnet**
  - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net
  - **MovieFap**
  - **Moviezine**
  - **MovingImage**
- - **MPORA**
  - **MSN**
  - **mtg**: MTG services
  - **mtv**
  - **netease:song**: 网易云音乐
  - **Netzkino**
  - **Newgrounds**
+ - **NewgroundsPlaylist**
  - **Newstube**
  - **NextMedia**: 蘋果日報
  - **NextMediaActionNews**: 蘋果日報 - 動新聞
  - **NextTV**: 壹電視
+ - **Nexx**
+ - **NexxEmbed**
  - **nfb**: National Film Board of Canada
  - **nfl.com**
  - **NhkVod**
  - **nhl.com:videocenter:category**: NHL videocenter category
  - **nick.com**
  - **nick.de**
+ - **nickelodeonru**
  - **nicknight**
  - **niconico**: ニコニコ動画
  - **NiconicoPlaylist**
  - **NJPWWorld**: 新日本プロレスワールド
  - **NobelPrize**
  - **Noco**
+ - **NonkTube**
+ - **Noovo**
  - **Normalboots**
  - **NosVideo**
  - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
  - **NowTVList**
  - **nowvideo**: NowVideo
  - **Noz**
- - **npo**: npo.nl and ntr.nl
+ - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **npo.nl:live**
  - **npo.nl:radio**
  - **npo.nl:radio:fragment**
  - **Patreon**
  - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET  (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
  - **pcmag**
+ - **PearVideo**
  - **People**
  - **periscope**: Periscope
  - **periscope:user**: Periscope user videos
  - **pluralsight**
  - **pluralsight:course**
  - **plus.google**: Google Plus
- - **pluzz.francetv.fr**
  - **podomatic**
  - **Pokemon**
  - **PolskieRadio**
  - **PolskieRadioCategory**
  - **PornCom**
+ - **PornerBros**
  - **PornFlip**
  - **PornHd**
  - **PornHub**: PornHub and Thumbzilla
  - **Pornotube**
  - **PornoVoisines**
  - **PornoXO**
+ - **PornTube**
  - **PressTV**
  - **PrimeShareTV**
  - **PromptFile**
  - **RadioJavan**
  - **Rai**
  - **RaiPlay**
+ - **RaiPlayLive**
  - **RBMARadio**
  - **RDS**: RDS.ca
  - **RedBullTV**
+ - **Reddit**
+ - **RedditR**
  - **RedTube**
  - **RegioTV**
  - **RENTV**
  - **rutube:embed**: Rutube embedded videos
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
+ - **rutube:playlist**: Rutube playlists
  - **RUTV**: RUTV.RU
  - **Ruutu**
+ - **Ruv**
  - **safari**: safaribooksonline.com online video
  - **safari:api**
  - **safari:course**: safaribooksonline.com online courses
  - **soundcloud:playlist**
  - **soundcloud:search**: Soundcloud search
  - **soundcloud:set**
+ - **soundcloud:trackstation**
  - **soundcloud:user**
  - **soundgasm**
  - **soundgasm:profile**
  - **Tagesschau**
  - **tagesschau:player**
  - **Tass**
- - **TBS**
+ - **TastyTrade**
+ - **TBS** (Currently broken)
  - **TDSLifeway**
  - **teachertube**: teachertube.com videos
  - **teachertube:user:collection**: teachertube.com user and collection videos
  - **TeachingChannel**
  - **Teamcoco**
- - **TeamFourStar**
  - **TechTalks**
  - **techtv.mit.edu**
  - **ted**
  - **ToonGoggles**
  - **Tosh**: Tosh.0
  - **tou.tv**
- - **Toypics**: Toypics user profile
+ - **Toypics**: Toypics video
  - **ToypicsUser**: Toypics user profile
  - **TrailerAddict** (Currently broken)
  - **Trilulilu**
  - **TruTV**
  - **Tube8**
  - **TubiTv**
- - **tudou**
- - **tudou:album**
- - **tudou:playlist**
  - **Tumblr**
  - **tunein:clip**
  - **tunein:program**
  - **uol.com.br**
  - **uplynk**
  - **uplynk:preplay**
+ - **Upskill**
+ - **UpskillCourse**
  - **Urort**: NRK P3 Urørt
  - **URPlay**
  - **USANetwork**
  - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
  - **vh1.com**
  - **Viafree**
- - **Vice**
+ - **vice**
+ - **vice:article**
+ - **vice:show**
  - **Viceland**
- - **ViceShow**
  - **Vidbit**
  - **Viddler**
  - **Videa**
  - **vk:wallpost**
  - **vlive**
  - **vlive:channel**
+ - **vlive:playlist**
  - **Vodlocker**
  - **VODPl**
  - **VODPlatform**
  - **VoiceRepublic**
+ - **Voot**
  - **VoxMedia**
  - **Vporn**
- - **vpro**: npo.nl and ntr.nl
+ - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **Vrak**
  - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be
  - **vrv**
  - **washingtonpost**
  - **washingtonpost:article**
  - **wat.tv**
+ - **WatchBox**
  - **WatchIndianPorn**: Watch Indian Porn
  - **WDR**
  - **wdr:mobile**
  - **wholecloud**: WholeCloud
  - **Wimp**
  - **Wistia**
- - **wnl**: npo.nl and ntr.nl
+ - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
  - **WorldStarHipHop**
  - **wrzuta.pl**
  - **wrzuta.pl:playlist**
  - **WSJArticle**
  - **XBef**
  - **XboxClips**
- - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo
+ - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me
  - **XHamster**
  - **XHamsterEmbed**
  - **xiami:album**: 虾米音乐 - 专辑
  - **XVideos**
  - **XXXYMovies**
  - **Yahoo**: Yahoo screen and movies
- - **Yam**: 蕃薯藤yam天空部落
+ - **YandexDisk**
  - **yandexmusic:album**: Яндекс.Музыка - Альбом
  - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
  - **yandexmusic:track**: Яндекс.Музыка - Трек
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
  - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
  - **Zapiks**
+ - **Zaq1**
  - **ZDF**
  - **ZDFChannel**
  - **zingmp3**: mp3.zing.vn
index 6f52e11f7b4b58c388e1a2bde12a64a76c62d107..f18a823fcf834e4bbae95e9d72f9f3821c307c2b 100644 (file)
@@ -10,6 +10,7 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from test.helper import FakeYDL, expect_dict, expect_value
+from youtube_dl.compat import compat_etree_fromstring
 from youtube_dl.extractor.common import InfoExtractor
 from youtube_dl.extractor import YoutubeIE, get_info_extractor
 from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
@@ -488,6 +489,91 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
 
+    def test_parse_mpd_formats(self):
+        _TEST_CASES = [
+            (
+                # https://github.com/rg3/youtube-dl/issues/13919
+                'float_duration',
+                'http://unknown/manifest.mpd',
+                [{
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '318597',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.42001f',
+                    'tbr': 318.597,
+                    'width': 340,
+                    'height': 192,
+                }, {
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '638590',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.42001f',
+                    'tbr': 638.59,
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '1022565',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.4d001f',
+                    'tbr': 1022.565,
+                    'width': 688,
+                    'height': 384,
+                }, {
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '2046506',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.4d001f',
+                    'tbr': 2046.506,
+                    'width': 1024,
+                    'height': 576,
+                }, {
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '3998017',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.640029',
+                    'tbr': 3998.017,
+                    'width': 1280,
+                    'height': 720,
+                }, {
+                    'manifest_url': 'http://unknown/manifest.mpd',
+                    'ext': 'mp4',
+                    'format_id': '5997485',
+                    'format_note': 'DASH video',
+                    'protocol': 'http_dash_segments',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.640032',
+                    'tbr': 5997.485,
+                    'width': 1920,
+                    'height': 1080,
+                }]
+            ),
+        ]
+
+        for mpd_file, mpd_url, expected_formats in _TEST_CASES:
+            with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
+                         mode='r', encoding='utf-8') as f:
+                formats = self.ie._parse_mpd_formats(
+                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    mpd_url=mpd_url)
+                self.ie._sort_formats(formats)
+                expect_value(self, formats, expected_formats, None)
+
 
 if __name__ == '__main__':
     unittest.main()
index 75945e38f837fc11496856180336fc2828121cca..e70cbcd375a4670bb586612ccaa107605dc985dc 100644 (file)
@@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
         'id': 'testid',
         'title': 'testttitle',
         'extractor': 'testex',
+        'extractor_key': 'TestEx',
     }
     res.update(**kwargs)
     return res
@@ -370,6 +371,19 @@ class TestFormatSelection(unittest.TestCase):
         ydl = YDL({'format': 'best[height>360]'})
         self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
 
+    def test_format_selection_issue_10083(self):
+        # See https://github.com/rg3/youtube-dl/issues/10083
+        formats = [
+            {'format_id': 'regular', 'height': 360, 'url': TEST_URL},
+            {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL},
+            {'format_id': 'audio', 'vcodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'best[height>360]/bestvideo[height>360]+bestaudio'})
+        ydl.process_ie_result(info_dict.copy())
+        self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'video+audio')
+
     def test_invalid_format_specs(self):
         def assert_syntax_error(format_spec):
             ydl = YDL({'format': format_spec})
@@ -448,6 +462,17 @@ class TestFormatSelection(unittest.TestCase):
             pass
         self.assertEqual(ydl.downloaded_info_dicts, [])
 
+    def test_default_format_spec(self):
+        ydl = YDL({'simulate': True})
+        self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best')
+
+        ydl = YDL({'outtmpl': '-'})
+        self.assertEqual(ydl._default_format_spec({}), 'best')
+
+        ydl = YDL({})
+        self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best')
+        self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best')
+
 
 class TestYoutubeDL(unittest.TestCase):
     def test_subtitles(self):
@@ -527,6 +552,8 @@ class TestYoutubeDL(unittest.TestCase):
             'ext': 'mp4',
             'width': None,
             'height': 1080,
+            'title1': '$PATH',
+            'title2': '%PATH%',
         }
 
         def fname(templ):
@@ -545,10 +572,14 @@ class TestYoutubeDL(unittest.TestCase):
         self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4')
         self.assertEqual(fname('%(height)0   6d.%(ext)s'), ' 01080.mp4')
         self.assertEqual(fname('%(height)   0   6d.%(ext)s'), ' 01080.mp4')
+        self.assertEqual(fname('%%'), '%')
+        self.assertEqual(fname('%%%%'), '%%')
         self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4')
         self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4')
         self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s')
         self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4')
+        self.assertEqual(fname('Hello %(title1)s'), 'Hello $PATH')
+        self.assertEqual(fname('Hello %(title2)s'), 'Hello %PATH%')
 
     def test_format_note(self):
         ydl = YoutubeDL()
@@ -755,7 +786,8 @@ class TestYoutubeDL(unittest.TestCase):
                     '_type': 'url_transparent',
                     'url': 'foo2:',
                     'ie_key': 'Foo2',
-                    'title': 'foo1 title'
+                    'title': 'foo1 title',
+                    'id': 'foo1_id',
                 }
 
         class Foo2IE(InfoExtractor):
@@ -781,6 +813,9 @@ class TestYoutubeDL(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['url'], TEST_URL)
         self.assertEqual(downloaded['title'], 'foo1 title')
+        self.assertEqual(downloaded['id'], 'testid')
+        self.assertEqual(downloaded['extractor'], 'testex')
+        self.assertEqual(downloaded['extractor_key'], 'TestEx')
 
 
 if __name__ == '__main__':
diff --git a/test/test_options.py b/test/test_options.py
new file mode 100644 (file)
index 0000000..3a25a6b
--- /dev/null
@@ -0,0 +1,26 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.options import _hide_login_info
+
+
+class TestOptions(unittest.TestCase):
+    def test_hide_login_info(self):
+        self.assertEqual(_hide_login_info(['-u', 'foo', '-p', 'bar']),
+                         ['-u', 'PRIVATE', '-p', 'PRIVATE'])
+        self.assertEqual(_hide_login_info(['-u']), ['-u'])
+        self.assertEqual(_hide_login_info(['-u', 'foo', '-u', 'bar']),
+                         ['-u', 'PRIVATE', '-u', 'PRIVATE'])
+        self.assertEqual(_hide_login_info(['--username=foo']),
+                         ['--username=PRIVATE'])
+
+
+if __name__ == '__main__':
+    unittest.main()
index 4cd8188506ad296a407622b759d0ccf89c3c7117..e50f3764e57050c560365eb566979e171538985b 100644 (file)
@@ -44,6 +44,7 @@ from youtube_dl.utils import (
     limit_length,
     mimetype2ext,
     month_by_name,
+    multipart_encode,
     ohdave_rsa_encrypt,
     OnDemandPagedList,
     orderedSet,
@@ -97,6 +98,7 @@ from youtube_dl.compat import (
     compat_chr,
     compat_etree_fromstring,
     compat_getenv,
+    compat_os_name,
     compat_setenv,
     compat_urlparse,
     compat_parse_qs,
@@ -277,6 +279,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(unescapeHTML('&eacute;'), 'é')
         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+        self.assertEqual(unescapeHTML('&a&quot;'), '&a"')
         # HTML5 entities
         self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
 
@@ -338,6 +341,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
         self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500)
         self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100)
+        self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361)
+        self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
 
     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
@@ -445,7 +450,9 @@ class TestUtil(unittest.TestCase):
 
     def test_shell_quote(self):
         args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
-        self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+        self.assertEqual(
+            shell_quote(args),
+            """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
 
     def test_str_to_int(self):
         self.assertEqual(str_to_int('123,456'), 123456)
@@ -619,6 +626,16 @@ class TestUtil(unittest.TestCase):
             'http://example.com/path', {'test': '第二行тест'})),
             query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
 
+    def test_multipart_encode(self):
+        self.assertEqual(
+            multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0],
+            b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n')
+        self.assertEqual(
+            multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0],
+            b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n')
+        self.assertRaises(
+            ValueError, multipart_encode, {b'field': b'value'}, boundary='value')
+
     def test_dict_get(self):
         FALSE_VALUES = {
             'none': None,
@@ -666,6 +683,14 @@ class TestUtil(unittest.TestCase):
         d = json.loads(stripped)
         self.assertEqual(d, {'status': 'success'})
 
+        stripped = strip_jsonp('window.cb && window.cb({"status": "success"});')
+        d = json.loads(stripped)
+        self.assertEqual(d, {'status': 'success'})
+
+        stripped = strip_jsonp('window.cb && cb({"status": "success"});')
+        d = json.loads(stripped)
+        self.assertEqual(d, {'status': 'success'})
+
     def test_uppercase_escape(self):
         self.assertEqual(uppercase_escape('aä'), 'aä')
         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
@@ -895,10 +920,13 @@ class TestUtil(unittest.TestCase):
             supports_outside_bmp = False
         if supports_outside_bmp:
             self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+        # Malformed HTML should not break attributes extraction on older Python
+        self.assertEqual(extract_attributes('<mal"formed/>'), {})
 
     def test_clean_html(self):
         self.assertEqual(clean_html('a:\nb'), 'a: b')
         self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
+        self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
 
     def test_intlist_to_bytes(self):
         self.assertEqual(
@@ -908,7 +936,7 @@ class TestUtil(unittest.TestCase):
     def test_args_to_str(self):
         self.assertEqual(
             args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
-            'foo ba/r -baz \'2 be\' \'\''
+            'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
         )
 
     def test_parse_filesize(self):
@@ -1155,6 +1183,10 @@ part 3</font></u>
             cli_bool_option(
                 {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
             ['--check-certificate=true'])
+        self.assertEqual(
+            cli_bool_option(
+                {}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
+            [])
 
     def test_ohdave_rsa_encrypt(self):
         N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
@@ -1204,6 +1236,12 @@ part 3</font></u>
         self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
         self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
 
+        html = '''
+            <div itemprop="author" itemscope>foo</div>
+        '''
+
+        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+
     def test_get_elements_by_class(self):
         html = '''
             <span class="foo bar">nice</span><span class="foo bar">also nice</span>
diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py
new file mode 100644 (file)
index 0000000..324ca85
--- /dev/null
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import expect_value
+from youtube_dl.extractor import YoutubeIE
+
+
+class TestYoutubeChapters(unittest.TestCase):
+
+    _TEST_CASES = [
+        (
+            # https://www.youtube.com/watch?v=A22oy8dFjqc
+            # pattern: 00:00 - <title>
+            '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''',
+            1477,
+            [{
+                'start_time': 36,
+                'end_time': 162,
+                'title': 'Bohemian Rhapsody',
+            }, {
+                'start_time': 162,
+                'end_time': 413,
+                'title': 'Radio Ga Ga',
+            }, {
+                'start_time': 413,
+                'end_time': 454,
+                'title': 'Ay Oh!',
+            }, {
+                'start_time': 454,
+                'end_time': 728,
+                'title': 'Hammer To Fall',
+            }, {
+                'start_time': 728,
+                'end_time': 963,
+                'title': 'Crazy Little Thing Called Love',
+            }, {
+                'start_time': 963,
+                'end_time': 1038,
+                'title': 'We Will Rock You',
+            }, {
+                'start_time': 1038,
+                'end_time': 1272,
+                'title': 'We Are The Champions',
+            }, {
+                'start_time': 1272,
+                'end_time': 1477,
+                'title': 'Is This The World We Created...?',
+            }]
+        ),
+        (
+            # https://www.youtube.com/watch?v=ekYlRhALiRQ
+            # pattern: <num>. <title> 0:00
+            '1.  Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2.  Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3.  Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4.  The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5.  Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6.  Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.',
+            4009,
+            [{
+                'start_time': 0,
+                'end_time': 707,
+                'title': '1. Those Beaten Paths of Confusion',
+            }, {
+                'start_time': 707,
+                'end_time': 1590,
+                'title': '2. Beyond the Shadows of Emptiness & Nothingness',
+            }, {
+                'start_time': 1590,
+                'end_time': 2157,
+                'title': '3. Poison Yourself...With Thought',
+            }, {
+                'start_time': 2157,
+                'end_time': 2672,
+                'title': '4. The Agents of Transformation',
+            }, {
+                'start_time': 2672,
+                'end_time': 3187,
+                'title': '5. Drowning in the Pain of Consciousness',
+            }, {
+                'start_time': 3187,
+                'end_time': 4009,
+                'title': '6. Deny the Disease of Life',
+            }]
+        ),
+        (
+            # https://www.youtube.com/watch?v=WjL4pSzog9w
+            # pattern: 00:00 <title>
+            '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink  " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo',
+            2705,
+            [{
+                'start_time': 0,
+                'end_time': 428,
+                'title': 'Christening Unborn Deformities',
+            }, {
+                'start_time': 428,
+                'end_time': 976,
+                'title': 'Taste of Purity',
+            }, {
+                'start_time': 976,
+                'end_time': 1485,
+                'title': 'Sculpting Sins of a Universal Tongue',
+            }, {
+                'start_time': 1485,
+                'end_time': 1884,
+                'title': 'Birth',
+            }, {
+                'start_time': 1884,
+                'end_time': 2275,
+                'title': 'Neves',
+            }, {
+                'start_time': 2275,
+                'end_time': 2705,
+                'title': 'Libations in Limbo',
+            }]
+        ),
+        (
+            # https://www.youtube.com/watch?v=o3r1sn-t3is
+            # pattern: <title> 00:00 <note>
+            'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink  " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a>  (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity  <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>',
+            5640,
+            [{
+                'start_time': 45,
+                'end_time': 266,
+                'title': 'I-E-A-I-A-I-O',
+            }, {
+                'start_time': 266,
+                'end_time': 331,
+                'title': 'Suite-Pee (Incomplete)',
+            }, {
+                'start_time': 331,
+                'end_time': 522,
+                'title': 'Attack (First live performance since 2011)',
+            }, {
+                'start_time': 522,
+                'end_time': 752,
+                'title': 'Prison Song',
+            }, {
+                'start_time': 752,
+                'end_time': 932,
+                'title': 'Know (First live performance since 2011)',
+            }, {
+                'start_time': 932,
+                'end_time': 1153,
+                'title': 'Aerials',
+            }, {
+                'start_time': 1153,
+                'end_time': 1209,
+                'title': 'Soldier Side - Intro',
+            }, {
+                'start_time': 1209,
+                'end_time': 1472,
+                'title': 'B.Y.O.B.',
+            }, {
+                'start_time': 1472,
+                'end_time': 1668,
+                'title': 'Soil',
+            }, {
+                'start_time': 1668,
+                'end_time': 1838,
+                'title': 'Darts',
+            }, {
+                'start_time': 1838,
+                'end_time': 2105,
+                'title': 'Radio/Video',
+            }, {
+                'start_time': 2105,
+                'end_time': 2288,
+                'title': 'Hypnotize',
+            }, {
+                'start_time': 2288,
+                'end_time': 2460,
+                'title': 'Temper (First live performance since 1999)',
+            }, {
+                'start_time': 2460,
+                'end_time': 2577,
+                'title': 'CUBErt',
+            }, {
+                'start_time': 2577,
+                'end_time': 2787,
+                'title': 'Needles',
+            }, {
+                'start_time': 2787,
+                'end_time': 2978,
+                'title': 'Deer Dance',
+            }, {
+                'start_time': 2978,
+                'end_time': 3085,
+                'title': 'Bounce',
+            }, {
+                'start_time': 3085,
+                'end_time': 3232,
+                'title': 'Suggestions',
+            }, {
+                'start_time': 3232,
+                'end_time': 3493,
+                'title': 'Psycho',
+            }, {
+                'start_time': 3493,
+                'end_time': 3675,
+                'title': 'Chop Suey!',
+            }, {
+                'start_time': 3675,
+                'end_time': 3854,
+                'title': 'Lonely Day',
+            }, {
+                'start_time': 3854,
+                'end_time': 4090,
+                'title': 'Question!',
+            }, {
+                'start_time': 4090,
+                'end_time': 4420,
+                'title': 'Lost in Hollywood',
+            }, {
+                'start_time': 4420,
+                'end_time': 4577,
+                'title': 'Vicinity of Obscenity (First live performance since 2012)',
+            }, {
+                'start_time': 4577,
+                'end_time': 4802,
+                'title': 'Forest',
+            }, {
+                'start_time': 4802,
+                'end_time': 5037,
+                'title': 'Cigaro',
+            }, {
+                'start_time': 5037,
+                'end_time': 5273,
+                'title': 'Toxicity (with Chino Moreno)',
+            }, {
+                'start_time': 5273,
+                'end_time': 5640,
+                'title': 'Sugar',
+            }]
+        ),
+        (
+            # https://www.youtube.com/watch?v=PkYLQbsqCE8
+            # pattern: <num> - <title> [<latinized title>] 0:00:00
+            '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink  " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink  " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink  " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''',
+            1138,
+            [{
+                'start_time': 0,
+                'end_time': 490,
+                'title': '1 - Во прах [Vo prakh]',
+            }, {
+                'start_time': 490,
+                'end_time': 870,
+                'title': '2 - Искупление [Iskupleniye]',
+            }, {
+                'start_time': 870,
+                'end_time': 1138,
+                'title': '3 - Из серпов луны...[Iz serpov luny]',
+            }]
+        ),
+        (
+            # https://www.youtube.com/watch?v=xZW70zEasOk
+            # time point more than duration
+            '''● LCS Spring finals: Saturday and Sunday from <a href="#" onclick="yt.www.watch.player.seekTo(13*60+30);return false;">13:30</a> outside the venue! <br />● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''',
+            283,
+            []
+        ),
+    ]
+
+    def test_youtube_chapters(self):
+        for description, duration, expected_chapters in self._TEST_CASES:
+            ie = YoutubeIE()
+            expect_value(
+                self, ie._extract_chapters(description, duration),
+                expected_chapters, None)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/testdata/mpd/float_duration.mpd b/test/testdata/mpd/float_duration.mpd
new file mode 100644 (file)
index 0000000..8dc1d2d
--- /dev/null
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<MPD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:mpeg:dash:schema:mpd:2011" type="static" minBufferTime="PT2S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" mediaPresentationDuration="PT6014S">
+       <Period bitstreamSwitching="true">
+               <AdaptationSet mimeType="audio/mp4" codecs="mp4a.40.2" startWithSAP="1" segmentAlignment="true">
+                       <SegmentTemplate timescale="1000000" presentationTimeOffset="0" initialization="ai_$RepresentationID$.mp4d" media="a_$RepresentationID$_$Number$.mp4d" duration="2000000.0" startNumber="0"></SegmentTemplate>
+                       <Representation id="318597" bandwidth="61587"></Representation>
+               </AdaptationSet>
+               <AdaptationSet mimeType="video/mp4" startWithSAP="1" segmentAlignment="true">
+                       <SegmentTemplate timescale="1000000" presentationTimeOffset="0" initialization="vi_$RepresentationID$.mp4d" media="v_$RepresentationID$_$Number$.mp4d" duration="2000000.0" startNumber="0"></SegmentTemplate>
+                       <Representation id="318597" codecs="avc1.42001f" width="340" height="192" bandwidth="318597"></Representation>
+                       <Representation id="638590" codecs="avc1.42001f" width="512" height="288" bandwidth="638590"></Representation>
+                       <Representation id="1022565" codecs="avc1.4d001f" width="688" height="384" bandwidth="1022565"></Representation>
+                       <Representation id="2046506" codecs="avc1.4d001f" width="1024" height="576" bandwidth="2046506"></Representation>
+                       <Representation id="3998017" codecs="avc1.640029" width="1280" height="720" bandwidth="3998017"></Representation>
+                       <Representation id="5997485" codecs="avc1.640032" width="1920" height="1080" bandwidth="5997485"></Representation>
+               </AdaptationSet>
+       </Period>
+</MPD>
\ No newline at end of file
index c7100bb91efbddc52806ce723e386d1366fba666..5405a87c5f377d7673b08e5560546d639614a6ca 100755 (executable)
@@ -26,6 +26,8 @@ import tokenize
 import traceback
 import random
 
+from string import ascii_letters
+
 from .compat import (
     compat_basestring,
     compat_cookiejar,
@@ -58,6 +60,7 @@ from .utils import (
     format_bytes,
     formatSeconds,
     GeoRestrictedError,
+    int_or_none,
     ISO3166Utils,
     locked_file,
     make_HTTPS_handler,
@@ -302,6 +305,17 @@ class YoutubeDL(object):
                         postprocessor.
     """
 
+    _NUMERIC_FIELDS = set((
+        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+        'timestamp', 'upload_year', 'upload_month', 'upload_day',
+        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+        'average_rating', 'comment_count', 'age_limit',
+        'start_time', 'end_time',
+        'chapter_number', 'season_number', 'episode_number',
+        'track_number', 'disc_number', 'release_year',
+        'playlist_index',
+    ))
+
     params = None
     _ies = []
     _pps = []
@@ -370,10 +384,10 @@ class YoutubeDL(object):
                 else:
                     raise
 
-        if (sys.version_info >= (3,) and sys.platform != 'win32' and
+        if (sys.platform != 'win32' and
                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                 not params.get('restrictfilenames', False)):
-            # On Python 3, the Unicode filesystem API will throw errors (#1474)
+            # Unicode filesystem API will throw errors (#1474, #13027)
             self.report_warning(
                 'Assuming --restrict-filenames since file system encoding '
                 'cannot encode all characters. '
@@ -498,24 +512,25 @@ class YoutubeDL(object):
     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
             return
-        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
-            # c_wchar_p() might not be necessary if `message` is
-            # already of type unicode()
-            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        if compat_os_name == 'nt':
+            if ctypes.windll.kernel32.GetConsoleWindow():
+                # c_wchar_p() might not be necessary if `message` is
+                # already of type unicode()
+                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
         elif 'TERM' in os.environ:
             self._write_string('\033]0;%s\007' % message, self._screen_file)
 
     def save_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if 'TERM' in os.environ:
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Save the title on stack
             self._write_string('\033[22;0t', self._screen_file)
 
     def restore_console_title(self):
         if not self.params.get('consoletitle', False):
             return
-        if 'TERM' in os.environ:
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
             # Restore the title from stack
             self._write_string('\033[23;0t', self._screen_file)
 
@@ -638,22 +653,11 @@ class YoutubeDL(object):
                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                     outtmpl)
 
-            NUMERIC_FIELDS = set((
-                'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
-                'timestamp', 'upload_year', 'upload_month', 'upload_day',
-                'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
-                'average_rating', 'comment_count', 'age_limit',
-                'start_time', 'end_time',
-                'chapter_number', 'season_number', 'episode_number',
-                'track_number', 'disc_number', 'release_year',
-                'playlist_index',
-            ))
-
             # Missing numeric fields used together with integer presentation types
             # in format specification will break the argument substitution since
             # string 'NA' is returned for missing fields. We will patch output
             # template for missing fields to meet string presentation type.
-            for numeric_field in NUMERIC_FIELDS:
+            for numeric_field in self._NUMERIC_FIELDS:
                 if numeric_field not in template_dict:
                     # As of [1] format syntax is:
                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
@@ -672,7 +676,19 @@ class YoutubeDL(object):
                         FORMAT_RE.format(numeric_field),
                         r'%({0})s'.format(numeric_field), outtmpl)
 
-            filename = expand_path(outtmpl % template_dict)
+            # expand_path translates '%%' into '%' and '$$' into '$'
+            # correspondingly that is not what we want since we need to keep
+            # '%%' intact for template dict substitution step. Working around
+            # with boundary-alike separator hack.
+            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+            # outtmpl should be expand_path'ed before template dict substitution
+            # because meta fields may contain env variables we don't want to
+            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+            # title "Hello $PATH", we don't want `$PATH` to be expanded.
+            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
             # Temporary fix for #4787
             # 'Treat' all problem characters by passing filename through preferredencoding
             # to workaround encoding issues with subprocess on python2 @ Windows
@@ -844,7 +860,7 @@ class YoutubeDL(object):
 
             force_properties = dict(
                 (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'ie_key'):
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                 if f in force_properties:
                     del force_properties[f]
             new_result = info.copy()
@@ -1048,6 +1064,25 @@ class YoutubeDL(object):
             return op(actual_value, comparison_value)
         return _filter
 
+    def _default_format_spec(self, info_dict, download=True):
+        req_format_list = []
+
+        def can_have_partial_formats():
+            if self.params.get('simulate', False):
+                return True
+            if not download:
+                return True
+            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+                return False
+            if info_dict.get('is_live'):
+                return False
+            merger = FFmpegMergerPP(self)
+            return merger.available and merger.can_merge()
+        if can_have_partial_formats():
+            req_format_list.append('bestvideo+bestaudio')
+        req_format_list.append('best')
+        return '/'.join(req_format_list)
+
     def build_format_selector(self, format_spec):
         def syntax_error(note, start):
             message = (
@@ -1344,9 +1379,28 @@ class YoutubeDL(object):
         if 'title' not in info_dict:
             raise ExtractorError('Missing "title" field in extractor result')
 
-        if not isinstance(info_dict['id'], compat_str):
-            self.report_warning('"id" field is not a string - forcing string conversion')
-            info_dict['id'] = compat_str(info_dict['id'])
+        def report_force_conversion(field, field_not, conversion):
+            self.report_warning(
+                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+                % (field, field_not, conversion))
+
+        def sanitize_string_field(info, string_field):
+            field = info.get(string_field)
+            if field is None or isinstance(field, compat_str):
+                return
+            report_force_conversion(string_field, 'a string', 'string')
+            info[string_field] = compat_str(field)
+
+        def sanitize_numeric_fields(info):
+            for numeric_field in self._NUMERIC_FIELDS:
+                field = info.get(numeric_field)
+                if field is None or isinstance(field, compat_numeric_types):
+                    continue
+                report_force_conversion(numeric_field, 'numeric', 'int')
+                info[numeric_field] = int_or_none(field)
+
+        sanitize_string_field(info_dict, 'id')
+        sanitize_numeric_fields(info_dict)
 
         if 'playlist' not in info_dict:
             # It isn't part of a playlist
@@ -1427,16 +1481,28 @@ class YoutubeDL(object):
         if not formats:
             raise ExtractorError('No video formats found!')
 
+        def is_wellformed(f):
+            url = f.get('url')
+            if not url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+                return False
+            if isinstance(url, bytes):
+                sanitize_string_field(f, 'url')
+            return True
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
         formats_dict = {}
 
         # We check that all the formats have the format and format_id fields
         for i, format in enumerate(formats):
-            if 'url' not in format:
-                raise ExtractorError('Missing "url" key in result (index %d)' % i)
-
+            sanitize_string_field(format, 'format_id')
+            sanitize_numeric_fields(format)
             format['url'] = sanitize_url(format['url'])
-
-            if format.get('format_id') is None:
+            if not format.get('format_id'):
                 format['format_id'] = compat_str(i)
             else:
                 # Sanitize format_id from characters used in format selector expression
@@ -1489,14 +1555,10 @@ class YoutubeDL(object):
 
         req_format = self.params.get('format')
         if req_format is None:
-            req_format_list = []
-            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
-                    not info_dict.get('is_live')):
-                merger = FFmpegMergerPP(self)
-                if merger.available and merger.can_merge():
-                    req_format_list.append('bestvideo+bestaudio')
-            req_format_list.append('best')
-            req_format = '/'.join(req_format_list)
+            req_format = self._default_format_spec(info_dict, download=download)
+            if self.params.get('verbose'):
+                self.to_stdout('[debug] Default format spec: %s' % req_format)
+
         format_selector = self.build_format_selector(req_format)
 
         # While in format selection we may need to have an access to the original
@@ -1648,12 +1710,17 @@ class YoutubeDL(object):
         if filename is None:
             return
 
-        try:
-            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
-            if dn and not os.path.exists(dn):
-                os.makedirs(dn)
-        except (OSError, IOError) as err:
-            self.report_error('unable to create directory ' + error_to_compat_str(err))
+        def ensure_dir_exists(path):
+            try:
+                dn = os.path.dirname(path)
+                if dn and not os.path.exists(dn):
+                    os.makedirs(dn)
+                return True
+            except (OSError, IOError) as err:
+                self.report_error('unable to create directory ' + error_to_compat_str(err))
+                return False
+
+        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
             return
 
         if self.params.get('writedescription', False):
@@ -1792,8 +1859,11 @@ class YoutubeDL(object):
                         for f in requested_formats:
                             new_info = dict(info_dict)
                             new_info.update(f)
-                            fname = self.prepare_filename(new_info)
-                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+                            fname = prepend_extension(
+                                self.prepare_filename(new_info),
+                                'f%s' % f['format_id'], new_info['ext'])
+                            if not ensure_dir_exists(fname):
+                                return
                             downloaded.append(fname)
                             partial_success = dl(fname, new_info)
                             success = success and partial_success
@@ -1860,7 +1930,7 @@ class YoutubeDL(object):
                         info_dict.get('protocol') == 'm3u8' and
                         self.params.get('hls_prefer_native')):
                     if fixup_policy == 'warn':
-                        self.report_warning('%s: malformated aac bitstream.' % (
+                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                             info_dict['id']))
                     elif fixup_policy == 'detect_or_warn':
                         fixup_pp = FFmpegFixupM3u8PP(self)
@@ -1869,7 +1939,7 @@ class YoutubeDL(object):
                             info_dict['__postprocessors'].append(fixup_pp)
                         else:
                             self.report_warning(
-                                '%s: malformated aac bitstream. %s'
+                                '%s: malformed AAC bitstream detected. %s'
                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                     else:
                         assert fixup_policy in ('ignore', 'never')
index 39527117fa27f049d5bc56390439091e38ca5f49..9e4e13bcf6a5230995589ea1823873f5ac315c71 100644 (file)
@@ -2322,6 +2322,19 @@ try:
 except ImportError:  # Python 2
     from HTMLParser import HTMLParser as compat_HTMLParser
 
+try:  # Python 2
+    from HTMLParser import HTMLParseError as compat_HTMLParseError
+except ImportError:  # Python <3.4
+    try:
+        from html.parser import HTMLParseError as compat_HTMLParseError
+    except ImportError:  # Python >3.4
+
+        # HTMLParseError has been deprecated in Python 3.3 and removed in
+        # Python 3.5. Introducing dummy exception for Python >3.5 for compatible
+        # and uniform cross-version exceptiong handling
+        class compat_HTMLParseError(Exception):
+            pass
+
 try:
     from subprocess import DEVNULL
     compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@ -2604,14 +2617,22 @@ except ImportError:  # Python 2
                 parsed_result[name] = [value]
         return parsed_result
 
-try:
-    from shlex import quote as compat_shlex_quote
-except ImportError:  # Python < 3.3
+
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
+if compat_os_name == 'nt':
     def compat_shlex_quote(s):
-        if re.match(r'^[-_\w./]+$', s):
-            return s
-        else:
-            return "'" + s.replace("'", "'\"'\"'") + "'"
+        return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')
+else:
+    try:
+        from shlex import quote as compat_shlex_quote
+    except ImportError:  # Python < 3.3
+        def compat_shlex_quote(s):
+            if re.match(r'^[-_\w./]+$', s):
+                return s
+            else:
+                return "'" + s.replace("'", "'\"'\"'") + "'"
 
 
 try:
@@ -2636,9 +2657,6 @@ def compat_ord(c):
         return ord(c)
 
 
-compat_os_name = os._name if os.name == 'java' else os.name
-
-
 if sys.version_info >= (3, 0):
     compat_getenv = os.getenv
     compat_expanduser = os.path.expanduser
@@ -2882,6 +2900,7 @@ else:
 
 
 __all__ = [
+    'compat_HTMLParseError',
     'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
index 5d66211476c521c80054558b5d1134ac4179a828..75b8166c514485bad26fa87a90ca08ab330d654f 100644 (file)
@@ -8,10 +8,11 @@ import random
 
 from ..compat import compat_os_name
 from ..utils import (
+    decodeArgument,
     encodeFilename,
     error_to_compat_str,
-    decodeArgument,
     format_bytes,
+    shell_quote,
     timeconvert,
 )
 
@@ -303,11 +304,11 @@ class FileDownloader(object):
         """Report attempt to resume at given byte."""
         self.to_screen('[download] Resuming download at byte %s' % resume_len)
 
-    def report_retry(self, count, retries):
+    def report_retry(self, err, count, retries):
         """Report retry in case of HTTP error 5xx"""
         self.to_screen(
-            '[download] Got server HTTP error. Retrying (attempt %d of %s)...'
-            % (count, self.format_retries(retries)))
+            '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...'
+            % (error_to_compat_str(err), count, self.format_retries(retries)))
 
     def report_file_already_downloaded(self, file_name):
         """Report file has already been fully downloaded."""
@@ -381,10 +382,5 @@ class FileDownloader(object):
         if exe is None:
             exe = os.path.basename(str_args[0])
 
-        try:
-            import pipes
-            shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
-        except ImportError:
-            shell_quote = repr
         self.to_screen('[debug] %s command line: %s' % (
             exe, shell_quote(str_args)))
index 7491fdad857af2f36433b8991b6208fbd9cef99e..576ece6db369254bf491cac972dc845ef1ac2653 100644 (file)
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
+from ..utils import urljoin
 
 
 class DashSegmentsFD(FragmentFD):
@@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD):
     FD_NAME = 'dashsegments'
 
     def real_download(self, filename, info_dict):
-        segments = info_dict['fragments'][:1] if self.params.get(
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
             'test', False) else info_dict['fragments']
 
         ctx = {
             'filename': filename,
-            'total_frags': len(segments),
+            'total_frags': len(fragments),
         }
 
         self._prepare_and_start_frag_download(ctx)
@@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD):
         skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
 
         frag_index = 0
-        for i, segment in enumerate(segments):
+        for i, fragment in enumerate(fragments):
             frag_index += 1
             if frag_index <= ctx['fragment_index']:
                 continue
@@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD):
             count = 0
             while count <= fragment_retries:
                 try:
-                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+                    fragment_url = fragment.get('url')
+                    if not fragment_url:
+                        assert fragment_base_url
+                        fragment_url = urljoin(fragment_base_url, fragment['path'])
+                    success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
                     if not success:
                         return False
                     self._append_fragment(ctx, frag_content)
index e13cf547d10cbf472440c9f23d010a586b2c453c..db018fa89e7b137c55fae08bc78d5b8d1c98f83f 100644 (file)
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
 
-        retval = self._call_downloader(tmpfilename, info_dict)
+        try:
+            retval = self._call_downloader(tmpfilename, info_dict)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            # Live stream downloading cancellation should be considered as
+            # correct and expected termination thus all postprocessing
+            # should take place
+            retval = 0
+            self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
         if retval == 0:
             fsize = os.path.getsize(encodeFilename(tmpfilename))
             self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
@@ -202,6 +212,11 @@ class FFmpegFD(ExternalFD):
 
         args = [ffpp.executable, '-y']
 
+        for log_level in ('quiet', 'verbose'):
+            if self.params.get(log_level, False):
+                args += ['-loglevel', log_level]
+                break
+
         seekable = info_dict.get('_seekable')
         if seekable is not None:
             # setting -seekable prevents ffmpeg from guessing if the server
index d529ae09ad14d38e8cc6b8659ebf5fd982e26406..bccc8ecc1e91af231bbf63cf07f9ee9e50948b25 100644 (file)
@@ -49,7 +49,7 @@ class FragmentFD(FileDownloader):
                 index:  0-based index of current fragment among all fragments
             fragment_count:
                 Total count of fragments
-                
+
     This feature is experimental and file format may change in future.
     """
 
index 0e29c8a2ad2559737d8c0210e9c6784310616ec6..46308cf072c25086d896bb759adad10a74d2cfc6 100644 (file)
@@ -59,9 +59,9 @@ class HlsFD(FragmentFD):
         man_url = info_dict['url']
         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
 
-        manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read()
-
-        s = manifest.decode('utf-8', 'ignore')
+        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
+        man_url = urlh.geturl()
+        s = urlh.read().decode('utf-8', 'ignore')
 
         if not self.can_download(s, info_dict):
             if info_dict.get('extra_param_to_segment_url'):
index af405b9509572bfd42bb11bd48bec5300d8105b3..8a6638cc2d47c4afa08104d77e0877f92749ac8a 100644 (file)
@@ -22,8 +22,16 @@ from ..utils import (
 class HttpFD(FileDownloader):
     def real_download(self, filename, info_dict):
         url = info_dict['url']
-        tmpfilename = self.temp_name(filename)
-        stream = None
+
+        class DownloadContext(dict):
+            __getattr__ = dict.get
+            __setattr__ = dict.__setitem__
+            __delattr__ = dict.__delitem__
+
+        ctx = DownloadContext()
+        ctx.filename = filename
+        ctx.tmpfilename = self.temp_name(filename)
+        ctx.stream = None
 
         # Do not include the Accept-Encoding header
         headers = {'Youtubedl-no-compression': 'True'}
@@ -38,46 +46,51 @@ class HttpFD(FileDownloader):
         if is_test:
             request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))
 
-        # Establish possible resume length
-        if os.path.isfile(encodeFilename(tmpfilename)):
-            resume_len = os.path.getsize(encodeFilename(tmpfilename))
-        else:
-            resume_len = 0
-
-        open_mode = 'wb'
-        if resume_len != 0:
-            if self.params.get('continuedl', True):
-                self.report_resuming_byte(resume_len)
-                request.add_header('Range', 'bytes=%d-' % resume_len)
-                open_mode = 'ab'
-            else:
-                resume_len = 0
+        ctx.open_mode = 'wb'
+        ctx.resume_len = 0
+
+        if self.params.get('continuedl', True):
+            # Establish possible resume length
+            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
+                ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
 
         count = 0
         retries = self.params.get('retries', 0)
-        while count <= retries:
+
+        class SucceedDownload(Exception):
+            pass
+
+        class RetryDownload(Exception):
+            def __init__(self, source_error):
+                self.source_error = source_error
+
+        def establish_connection():
+            if ctx.resume_len != 0:
+                self.report_resuming_byte(ctx.resume_len)
+                request.add_header('Range', 'bytes=%d-' % ctx.resume_len)
+                ctx.open_mode = 'ab'
             # Establish connection
             try:
-                data = self.ydl.urlopen(request)
+                ctx.data = self.ydl.urlopen(request)
                 # When trying to resume, Content-Range HTTP header of response has to be checked
                 # to match the value of requested Range HTTP header. This is due to a webservers
                 # that don't support resuming and serve a whole file with no Content-Range
                 # set in response despite of requested Range (see
                 # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
-                if resume_len > 0:
-                    content_range = data.headers.get('Content-Range')
+                if ctx.resume_len > 0:
+                    content_range = ctx.data.headers.get('Content-Range')
                     if content_range:
                         content_range_m = re.search(r'bytes (\d+)-', content_range)
                         # Content-Range is present and matches requested Range, resume is possible
-                        if content_range_m and resume_len == int(content_range_m.group(1)):
-                            break
+                        if content_range_m and ctx.resume_len == int(content_range_m.group(1)):
+                            return
                     # Content-Range is either not present or invalid. Assuming remote webserver is
                     # trying to send the whole file, resume is not possible, so wiping the local file
                     # and performing entire redownload
                     self.report_unable_to_resume()
-                    resume_len = 0
-                    open_mode = 'wb'
-                break
+                    ctx.resume_len = 0
+                    ctx.open_mode = 'wb'
+                return
             except (compat_urllib_error.HTTPError, ) as err:
                 if (err.code < 500 or err.code >= 600) and err.code != 416:
                     # Unexpected HTTP error
@@ -86,15 +99,15 @@ class HttpFD(FileDownloader):
                     # Unable to resume (requested range not satisfiable)
                     try:
                         # Open the connection again without the range header
-                        data = self.ydl.urlopen(basic_request)
-                        content_length = data.info()['Content-Length']
+                        ctx.data = self.ydl.urlopen(basic_request)
+                        content_length = ctx.data.info()['Content-Length']
                     except (compat_urllib_error.HTTPError, ) as err:
                         if err.code < 500 or err.code >= 600:
                             raise
                     else:
                         # Examine the reported length
                         if (content_length is not None and
-                                (resume_len - 100 < int(content_length) < resume_len + 100)):
+                                (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
                             # The file had already been fully downloaded.
                             # Explanation to the above condition: in issue #175 it was revealed that
                             # YouTube sometimes adds or removes a few bytes from the end of the file,
@@ -102,152 +115,184 @@ class HttpFD(FileDownloader):
                             # I decided to implement a suggested change and consider the file
                             # completely downloaded if the file size differs less than 100 bytes from
                             # the one in the hard drive.
-                            self.report_file_already_downloaded(filename)
-                            self.try_rename(tmpfilename, filename)
+                            self.report_file_already_downloaded(ctx.filename)
+                            self.try_rename(ctx.tmpfilename, ctx.filename)
                             self._hook_progress({
-                                'filename': filename,
+                                'filename': ctx.filename,
                                 'status': 'finished',
-                                'downloaded_bytes': resume_len,
-                                'total_bytes': resume_len,
+                                'downloaded_bytes': ctx.resume_len,
+                                'total_bytes': ctx.resume_len,
                             })
-                            return True
+                            raise SucceedDownload()
                         else:
                             # The length does not match, we start the download over
                             self.report_unable_to_resume()
-                            resume_len = 0
-                            open_mode = 'wb'
-                            break
-            except socket.error as e:
-                if e.errno != errno.ECONNRESET:
+                            ctx.resume_len = 0
+                            ctx.open_mode = 'wb'
+                            return
+                raise RetryDownload(err)
+            except socket.error as err:
+                if err.errno != errno.ECONNRESET:
                     # Connection reset is no problem, just retry
                     raise
+                raise RetryDownload(err)
+
+        def download():
+            data_len = ctx.data.info().get('Content-length', None)
+
+            # Range HTTP header may be ignored/unsupported by a webserver
+            # (e.g. extractor/scivee.py, extractor/bambuser.py).
+            # However, for a test we still would like to download just a piece of a file.
+            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+            # block size when downloading a file.
+            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+                data_len = self._TEST_FILE_SIZE
+
+            if data_len is not None:
+                data_len = int(data_len) + ctx.resume_len
+                min_data_len = self.params.get('min_filesize')
+                max_data_len = self.params.get('max_filesize')
+                if min_data_len is not None and data_len < min_data_len:
+                    self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+                    return False
+                if max_data_len is not None and data_len > max_data_len:
+                    self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+                    return False
 
-            # Retry
-            count += 1
-            if count <= retries:
-                self.report_retry(count, retries)
-
-        if count > retries:
-            self.report_error('giving up after %s retries' % retries)
-            return False
-
-        data_len = data.info().get('Content-length', None)
-
-        # Range HTTP header may be ignored/unsupported by a webserver
-        # (e.g. extractor/scivee.py, extractor/bambuser.py).
-        # However, for a test we still would like to download just a piece of a file.
-        # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
-        # block size when downloading a file.
-        if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
-            data_len = self._TEST_FILE_SIZE
-
-        if data_len is not None:
-            data_len = int(data_len) + resume_len
-            min_data_len = self.params.get('min_filesize')
-            max_data_len = self.params.get('max_filesize')
-            if min_data_len is not None and data_len < min_data_len:
-                self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
-                return False
-            if max_data_len is not None and data_len > max_data_len:
-                self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
-                return False
-
-        byte_counter = 0 + resume_len
-        block_size = self.params.get('buffersize', 1024)
-        start = time.time()
+            byte_counter = 0 + ctx.resume_len
+            block_size = self.params.get('buffersize', 1024)
+            start = time.time()
 
-        # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
-        now = None  # needed for slow_down() in the first loop run
-        before = start  # start measuring
-        while True:
+            # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
+            now = None  # needed for slow_down() in the first loop run
+            before = start  # start measuring
 
-            # Download and write
-            data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
-            byte_counter += len(data_block)
+            def retry(e):
+                if ctx.tmpfilename != '-':
+                    ctx.stream.close()
+                ctx.stream = None
+                ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
+                raise RetryDownload(e)
 
-            # exit loop when download is finished
-            if len(data_block) == 0:
-                break
+            while True:
+                try:
+                    # Download and write
+                    data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
+                # socket.timeout is a subclass of socket.error but may not have
+                # errno set
+                except socket.timeout as e:
+                    retry(e)
+                except socket.error as e:
+                    if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT):
+                        raise
+                    retry(e)
+
+                byte_counter += len(data_block)
+
+                # exit loop when download is finished
+                if len(data_block) == 0:
+                    break
+
+                # Open destination file just in time
+                if ctx.stream is None:
+                    try:
+                        ctx.stream, ctx.tmpfilename = sanitize_open(
+                            ctx.tmpfilename, ctx.open_mode)
+                        assert ctx.stream is not None
+                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
+                        self.report_destination(ctx.filename)
+                    except (OSError, IOError) as err:
+                        self.report_error('unable to open for writing: %s' % str(err))
+                        return False
+
+                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
+                        try:
+                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
+                        except (XAttrUnavailableError, XAttrMetadataError) as err:
+                            self.report_error('unable to set filesize xattr: %s' % str(err))
 
-            # Open destination file just in time
-            if stream is None:
                 try:
-                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
-                    assert stream is not None
-                    filename = self.undo_temp_name(tmpfilename)
-                    self.report_destination(filename)
-                except (OSError, IOError) as err:
-                    self.report_error('unable to open for writing: %s' % str(err))
+                    ctx.stream.write(data_block)
+                except (IOError, OSError) as err:
+                    self.to_stderr('\n')
+                    self.report_error('unable to write data: %s' % str(err))
                     return False
 
-                if self.params.get('xattr_set_filesize', False) and data_len is not None:
-                    try:
-                        write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
-                    except (XAttrUnavailableError, XAttrMetadataError) as err:
-                        self.report_error('unable to set filesize xattr: %s' % str(err))
-
-            try:
-                stream.write(data_block)
-            except (IOError, OSError) as err:
+                # Apply rate limit
+                self.slow_down(start, now, byte_counter - ctx.resume_len)
+
+                # end measuring of one loop run
+                now = time.time()
+                after = now
+
+                # Adjust block size
+                if not self.params.get('noresizebuffer', False):
+                    block_size = self.best_block_size(after - before, len(data_block))
+
+                before = after
+
+                # Progress message
+                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
+                if data_len is None:
+                    eta = None
+                else:
+                    eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len)
+
+                self._hook_progress({
+                    'status': 'downloading',
+                    'downloaded_bytes': byte_counter,
+                    'total_bytes': data_len,
+                    'tmpfilename': ctx.tmpfilename,
+                    'filename': ctx.filename,
+                    'eta': eta,
+                    'speed': speed,
+                    'elapsed': now - start,
+                })
+
+                if is_test and byte_counter == data_len:
+                    break
+
+            if ctx.stream is None:
                 self.to_stderr('\n')
-                self.report_error('unable to write data: %s' % str(err))
+                self.report_error('Did not get any data blocks')
                 return False
+            if ctx.tmpfilename != '-':
+                ctx.stream.close()
 
-            # Apply rate limit
-            self.slow_down(start, now, byte_counter - resume_len)
+            if data_len is not None and byte_counter != data_len:
+                err = ContentTooShortError(byte_counter, int(data_len))
+                if count <= retries:
+                    retry(err)
+                raise err
 
-            # end measuring of one loop run
-            now = time.time()
-            after = now
+            self.try_rename(ctx.tmpfilename, ctx.filename)
 
-            # Adjust block size
-            if not self.params.get('noresizebuffer', False):
-                block_size = self.best_block_size(after - before, len(data_block))
-
-            before = after
-
-            # Progress message
-            speed = self.calc_speed(start, now, byte_counter - resume_len)
-            if data_len is None:
-                eta = None
-            else:
-                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
+            # Update file modification time
+            if self.params.get('updatetime', True):
+                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
 
             self._hook_progress({
-                'status': 'downloading',
                 'downloaded_bytes': byte_counter,
-                'total_bytes': data_len,
-                'tmpfilename': tmpfilename,
-                'filename': filename,
-                'eta': eta,
-                'speed': speed,
-                'elapsed': now - start,
+                'total_bytes': byte_counter,
+                'filename': ctx.filename,
+                'status': 'finished',
+                'elapsed': time.time() - start,
             })
 
-            if is_test and byte_counter == data_len:
-                break
-
-        if stream is None:
-            self.to_stderr('\n')
-            self.report_error('Did not get any data blocks')
-            return False
-        if tmpfilename != '-':
-            stream.close()
-
-        if data_len is not None and byte_counter != data_len:
-            raise ContentTooShortError(byte_counter, int(data_len))
-        self.try_rename(tmpfilename, filename)
-
-        # Update file modification time
-        if self.params.get('updatetime', True):
-            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
-
-        self._hook_progress({
-            'downloaded_bytes': byte_counter,
-            'total_bytes': byte_counter,
-            'filename': filename,
-            'status': 'finished',
-            'elapsed': time.time() - start,
-        })
-
-        return True
+            return True
+
+        while count <= retries:
+            try:
+                establish_connection()
+                download()
+                return True
+            except RetryDownload as e:
+                count += 1
+                if count <= retries:
+                    self.report_retry(e.source_error, count, retries)
+                continue
+            except SucceedDownload:
+                return True
+
+        self.report_error('giving up after %s retries' % retries)
+        return False
index 5f6f9faefbad62592d699135437ec81add27ae39..9b001ecff4f407a94f74b59a8e1505ed27fd9d03 100644 (file)
@@ -98,7 +98,7 @@ def write_piff_header(stream, params):
 
     if is_audio:
         smhd_payload = s88.pack(0)  # balance
-        smhd_payload = u16.pack(0)  # reserved
+        smhd_payload += u16.pack(0)  # reserved
         media_header_box = full_box(b'smhd', 0, 0, smhd_payload)  # Sound Media Header
     else:
         vmhd_payload = u16.pack(0)  # graphics mode
@@ -126,7 +126,6 @@ def write_piff_header(stream, params):
         if fourcc == 'AACL':
             sample_entry_box = box(b'mp4a', sample_entry_payload)
     else:
-        sample_entry_payload = sample_entry_payload
         sample_entry_payload += u16.pack(0)  # pre defined
         sample_entry_payload += u16.pack(0)  # reserved
         sample_entry_payload += u32.pack(0) * 3  # pre defined
index 0247cabf9df8a6c61602085dcabe5f139b53420a..60f753b95c6e89158c3d292bd62e8bee2cd74746 100644 (file)
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     js_to_json,
     int_or_none,
     parse_iso8601,
+    try_get,
 )
 
 
@@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor):
         title = video_params.get('title') or video_params['seriesTitle']
         stream = next(s for s in video_params['playlist'] if s.get('type') == 'program')
 
-        formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id)
+        format_urls = [
+            try_get(stream, lambda x: x['hds-unmetered'], compat_str)]
+
+        # May have higher quality video
+        sd_url = try_get(
+            stream, lambda x: x['streams']['hds']['sd'], compat_str)
+        if sd_url:
+            format_urls.append(sd_url.replace('metered', 'um'))
+
+        formats = []
+        for format_url in format_urls:
+            if format_url:
+                formats.extend(
+                    self._extract_akamai_formats(format_url, video_id))
         self._sort_formats(formats)
 
         subtitles = {}
index 4f56c4c11935ee85a9412a39de20138bb83cc33d..f770fe901369e85d3df881cf651313f540544b03 100644 (file)
@@ -7,12 +7,21 @@ import time
 
 from .amp import AMPIE
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..compat import compat_urlparse
 
 
 class AbcNewsVideoIE(AMPIE):
     IE_NAME = 'abcnews:video'
-    _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        abcnews\.go\.com/
+                        (?:
+                            [^/]+/video/(?P<display_id>[0-9a-z-]+)-|
+                            video/embed\?.*?\bid=
+                        )
+                        (?P<id>\d+)
+                    '''
 
     _TESTS = [{
         'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
@@ -29,6 +38,9 @@ class AbcNewsVideoIE(AMPIE):
             # m3u8 download
             'skip_download': True,
         },
+    }, {
+        'url': 'http://abcnews.go.com/video/embed?id=46979033',
+        'only_matching': True,
     }, {
         'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
         'only_matching': True,
@@ -97,9 +109,7 @@ class AbcNewsIE(InfoExtractor):
             r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
         full_video_url = compat_urlparse.urljoin(url, video_url)
 
-        youtube_url = self._html_search_regex(
-            r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
-            webpage, 'YouTube URL', default=None)
+        youtube_url = YoutubeIE._extract_url(webpage)
 
         timestamp = None
         date_str = self._html_search_regex(
@@ -129,7 +139,7 @@ class AbcNewsIE(InfoExtractor):
         }
 
         if youtube_url:
-            entries = [entry, self.url_result(youtube_url, 'Youtube')]
+            entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
             return self.playlist_result(entries)
 
         return entry
index 76e98132b9d18514e54ed37e11df61089a75678c..03b92a39c785453123a4c5fbc5e09c98c7128634 100644 (file)
@@ -22,7 +22,7 @@ class ABCOTVSIE(InfoExtractor):
                 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
                 'ext': 'mp4',
                 'title': 'East Bay museum celebrates vintage synthesizers',
-                'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
+                'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'timestamp': 1421123075,
                 'upload_date': '20150113',
index 66caf6a818278796648dd4496da95c661fcdb36c..cffdab6ca4a4488caf06aa73579e363a7ede8fd4 100644 (file)
@@ -15,6 +15,7 @@ from ..utils import (
     intlist_to_bytes,
     srt_subtitles_timecode,
     strip_or_none,
+    urljoin,
 )
 
 
@@ -31,25 +32,28 @@ class ADNIE(InfoExtractor):
             'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
         }
     }
+    _BASE_URL = 'http://animedigitalnetwork.fr'
 
     def _get_subtitles(self, sub_path, video_id):
         if not sub_path:
             return None
 
         enc_subtitles = self._download_webpage(
-            'http://animedigitalnetwork.fr/' + sub_path,
-            video_id, fatal=False)
+            urljoin(self._BASE_URL, sub_path),
+            video_id, fatal=False, headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
+            })
         if not enc_subtitles:
             return None
 
         # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
         dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
             bytes_to_intlist(base64.b64decode(enc_subtitles[24:])),
-            bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'),
+            bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'),
             bytes_to_intlist(base64.b64decode(enc_subtitles[:24]))
         ))
         subtitles_json = self._parse_json(
-            dec_subtitles[:-compat_ord(dec_subtitles[-1])],
+            dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
             None, fatal=False)
         if not subtitles_json:
             return None
@@ -103,9 +107,18 @@ class ADNIE(InfoExtractor):
         metas = options.get('metas') or {}
         title = metas.get('title') or video_info['title']
         links = player_config.get('links') or {}
+        error = None
+        if not links:
+            links_url = player_config['linksurl']
+            links_data = self._download_json(urljoin(
+                self._BASE_URL, links_url), video_id)
+            links = links_data.get('links') or {}
+            error = links_data.get('error')
 
         formats = []
         for format_id, qualities in links.items():
+            if not isinstance(qualities, dict):
+                continue
             for load_balancer_url in qualities.values():
                 load_balancer_data = self._download_json(
                     load_balancer_url, video_id, fatal=False) or {}
@@ -119,7 +132,8 @@ class ADNIE(InfoExtractor):
                     for f in m3u8_formats:
                         f['language'] = 'fr'
                 formats.extend(m3u8_formats)
-        error = options.get('error')
+        if not error:
+            error = options.get('error')
         if not formats and error:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
         self._sort_formats(formats)
index 7da96c65c694fbd57fa47b8de30f77851e5b5f13..b83b51efb624a876bbb46658e50b6f6714e10048 100644 (file)
@@ -6,12 +6,16 @@ import time
 import xml.etree.ElementTree as etree
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_kwargs,
+    compat_urlparse,
+)
 from ..utils import (
     unescapeHTML,
     urlencode_postdata,
     unified_timestamp,
     ExtractorError,
+    NO_DEFAULT,
 )
 
 
@@ -21,6 +25,11 @@ MSO_INFO = {
         'username_field': 'username',
         'password_field': 'password',
     },
+    'ATTOTT': {
+        'name': 'DIRECTV NOW',
+        'username_field': 'email',
+        'password_field': 'loginpassword',
+    },
     'Rogers': {
         'name': 'Rogers',
         'username_field': 'UserName',
@@ -36,6 +45,11 @@ MSO_INFO = {
         'username_field': 'Ecom_User_ID',
         'password_field': 'Ecom_Password',
     },
+    'Brighthouse': {
+        'name': 'Bright House Networks | Spectrum',
+        'username_field': 'j_username',
+        'password_field': 'j_password',
+    },
     'Charter_Direct': {
         'name': 'Charter Spectrum',
         'username_field': 'IDToken1',
@@ -1308,11 +1322,14 @@ class AdobePassIE(InfoExtractor):
     _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
     _MVPD_CACHE = 'ap-mvpd'
 
+    _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page'
+
     def _download_webpage_handle(self, *args, **kwargs):
         headers = kwargs.get('headers', {})
         headers.update(self.geo_verification_headers())
         kwargs['headers'] = headers
-        return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+        return super(AdobePassIE, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))
 
     @staticmethod
     def _get_mvpd_resource(provider_id, title, guid, rating):
@@ -1356,6 +1373,21 @@ class AdobePassIE(InfoExtractor):
                 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier '
                 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True)
 
+        def extract_redirect_url(html, url=None, fatal=False):
+            # TODO: eliminate code duplication with generic extractor and move
+            # redirection code into _download_webpage_handle
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
+            redirect_url = self._search_regex(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
+                html, 'meta refresh redirect',
+                default=NO_DEFAULT if fatal else None, fatal=fatal)
+            if not redirect_url:
+                return None
+            if url:
+                redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url))
+            return redirect_url
+
         mvpd_headers = {
             'ap_42': 'anonymous',
             'ap_11': 'Linux i686',
@@ -1405,16 +1437,15 @@ class AdobePassIE(InfoExtractor):
                         if '<form name="signin"' in provider_redirect_page:
                             provider_login_page_res = provider_redirect_page_res
                         elif 'http-equiv="refresh"' in provider_redirect_page:
-                            oauth_redirect_url = self._html_search_regex(
-                                r'content="0;\s*url=([^\'"]+)',
-                                provider_redirect_page, 'meta refresh redirect')
+                            oauth_redirect_url = extract_redirect_url(
+                                provider_redirect_page, fatal=True)
                             provider_login_page_res = self._download_webpage_handle(
                                 oauth_redirect_url, video_id,
-                                'Downloading Provider Login Page')
+                                self._DOWNLOADING_LOGIN_PAGE)
                         else:
                             provider_login_page_res = post_form(
                                 provider_redirect_page_res,
-                                'Downloading Provider Login Page')
+                                self._DOWNLOADING_LOGIN_PAGE)
 
                         mvpd_confirm_page_res = post_form(
                             provider_login_page_res, 'Logging in', {
@@ -1461,8 +1492,17 @@ class AdobePassIE(InfoExtractor):
                             'Content-Type': 'application/x-www-form-urlencoded'
                         })
                 else:
+                    # Some providers (e.g. DIRECTV NOW) have another meta refresh
+                    # based redirect that should be followed.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    provider_refresh_redirect_url = extract_redirect_url(
+                        provider_redirect_page, url=urlh.geturl())
+                    if provider_refresh_redirect_url:
+                        provider_redirect_page_res = self._download_webpage_handle(
+                            provider_refresh_redirect_url, video_id,
+                            'Downloading Provider Redirect Page (meta refresh)')
                     provider_login_page_res = post_form(
-                        provider_redirect_page_res, 'Downloading Provider Login Page')
+                        provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
                     mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
                         mso_info.get('username_field', 'username'): username,
                         mso_info.get('password_field', 'password'): password,
index 989505c8232abf53f99d0af594c84e45f8778eb0..acc4ce38dca31a4ec8401d253044eb7e4fb91b3e 100644 (file)
@@ -5,91 +5,52 @@ import re
 
 from .turner import TurnerBaseIE
 from ..utils import (
-    ExtractorError,
     int_or_none,
+    strip_or_none,
 )
 
 
 class AdultSwimIE(TurnerBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
 
     _TESTS = [{
         'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
-        'playlist': [
-            {
-                'md5': '247572debc75c7652f253c8daa51a14d',
-                'info_dict': {
-                    'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
-                    'ext': 'flv',
-                    'title': 'Rick and Morty - Pilot Part 1',
-                    'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
-                },
-            },
-            {
-                'md5': '77b0e037a4b20ec6b98671c4c379f48d',
-                'info_dict': {
-                    'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
-                    'ext': 'flv',
-                    'title': 'Rick and Morty - Pilot Part 4',
-                    'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
-                },
-            },
-        ],
         'info_dict': {
             'id': 'rQxZvXQ4ROaSOqq-or2Mow',
+            'ext': 'mp4',
             'title': 'Rick and Morty - Pilot',
-            'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+            'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
+            'timestamp': 1493267400,
+            'upload_date': '20170427',
         },
-        'skip': 'This video is only available for registered users',
-    }, {
-        'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
-        'playlist': [
-            {
-                'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
-                'info_dict': {
-                    'id': '-t8CamQlQ2aYZ49ItZCFog-0',
-                    'ext': 'flv',
-                    'title': 'American Dad - Putting Francine Out of Business',
-                    'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
-                },
-            }
-        ],
-        'info_dict': {
-            'id': '-t8CamQlQ2aYZ49ItZCFog',
-            'title': 'American Dad - Putting Francine Out of Business',
-            'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+        'params': {
+            # m3u8 download
+            'skip_download': True,
         },
+        'expected_warnings': ['Unable to download f4m manifest'],
     }, {
         'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
-        'playlist': [
-            {
-                'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
-                'info_dict': {
-                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
-                    'ext': 'mp4',
-                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
-                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
-                },
-            }
-        ],
         'info_dict': {
             'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+            'ext': 'mp4',
             'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
-            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
+            'upload_date': '20080124',
+            'timestamp': 1201150800,
         },
         'params': {
             # m3u8 download
             'skip_download': True,
-        }
+        },
     }, {
-        # heroMetadata.trailer
         'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
         'info_dict': {
             'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
             'ext': 'mp4',
             'title': 'Decker - Inside Decker: A New Hero',
-            'description': 'md5:c916df071d425d62d70c86d4399d3ee0',
-            'duration': 249.008,
+            'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
+            'timestamp': 1469480460,
+            'upload_date': '20160725',
         },
         'params': {
             # m3u8 download
@@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE):
         },
         'expected_warnings': ['Unable to download f4m manifest'],
     }, {
-        'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/',
+        'url': 'http://www.adultswim.com/videos/attack-on-titan',
+        'info_dict': {
+            'id': 'b7A69dzfRzuaXIECdxW8XQ',
+            'title': 'Attack on Titan',
+            'description': 'md5:6c8e003ea0777b47013e894767f5e114',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://www.adultswim.com/videos/streams/williams-stream',
         'info_dict': {
-            'id': 'eYiLsKVgQ6qTC6agD67Sig',
-            'title': 'Toonami - Friday, October 14th, 2016',
-            'description': 'md5:99892c96ffc85e159a428de85c30acde',
+            'id': 'd8DEBj7QRfetLsRgFnGEyg',
+            'ext': 'mp4',
+            'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'description': 'original programming',
         },
-        'playlist': [{
-            'md5': '',
-            'info_dict': {
-                'id': 'eYiLsKVgQ6qTC6agD67Sig',
-                'ext': 'mp4',
-                'title': 'Toonami - Friday, October 14th, 2016',
-                'description': 'md5:99892c96ffc85e159a428de85c30acde',
-            },
-        }],
         'params': {
             # m3u8 download
             'skip_download': True,
         },
-        'expected_warnings': ['Unable to download f4m manifest'],
     }]
 
-    @staticmethod
-    def find_video_info(collection, slug):
-        for video in collection.get('videos'):
-            if video.get('slug') == slug:
-                return video
-
-    @staticmethod
-    def find_collection_by_linkURL(collections, linkURL):
-        for collection in collections:
-            if collection.get('linkURL') == linkURL:
-                return collection
-
-    @staticmethod
-    def find_collection_containing_video(collections, slug):
-        for collection in collections:
-            for video in collection.get('videos'):
-                if video.get('slug') == slug:
-                    return collection, video
-        return None, None
-
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        show_path = mobj.group('show_path')
-        episode_path = mobj.group('episode_path')
-        is_playlist = True if mobj.group('is_playlist') else False
-
-        webpage = self._download_webpage(url, episode_path)
-
-        # Extract the value of `bootstrappedData` from the Javascript in the page.
-        bootstrapped_data = self._parse_json(self._search_regex(
-            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
-
-        # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
-        # NOTE: We are only downloading one video (the current one) not the playlist
-        if is_playlist:
-            collections = bootstrapped_data['playlists']['collections']
-            collection = self.find_collection_by_linkURL(collections, show_path)
-            video_info = self.find_video_info(collection, episode_path)
-
-            show_title = video_info['showTitle']
-            segment_ids = [video_info['videoPlaybackID']]
+        show_path, episode_path = re.match(self._VALID_URL, url).groups()
+        display_id = episode_path or show_path
+        webpage = self._download_webpage(url, display_id)
+        initial_data = self._parse_json(self._search_regex(
+            r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});',
+            webpage, 'initial data'), display_id)
+
+        is_stream = show_path == 'streams'
+        if is_stream:
+            if not episode_path:
+                episode_path = 'live-stream'
+
+            video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path)
+            video_id = video_data.get('stream')
+
+            if not video_id:
+                entries = []
+                for episode in video_data.get('archiveEpisodes', []):
+                    episode_url = episode.get('url')
+                    if not episode_url:
+                        continue
+                    entries.append(self.url_result(
+                        episode_url, 'AdultSwim', episode.get('id')))
+                return self.playlist_result(
+                    entries, video_data.get('id'), video_data.get('title'),
+                    strip_or_none(video_data.get('description')))
         else:
-            collections = bootstrapped_data['show']['collections']
-            collection, video_info = self.find_collection_containing_video(collections, episode_path)
-            # Video wasn't found in the collections, let's try `slugged_video`.
-            if video_info is None:
-                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
-                    video_info = bootstrapped_data['slugged_video']
-            if not video_info:
-                video_info = bootstrapped_data.get(
-                    'heroMetadata', {}).get('trailer', {}).get('video')
-            if not video_info:
-                video_info = bootstrapped_data.get('onlineOriginals', [None])[0]
-            if not video_info:
-                raise ExtractorError('Unable to find video info')
-
-            show = bootstrapped_data['show']
-            show_title = show['title']
-            stream = video_info.get('stream')
-            if stream and stream.get('videoPlaybackID'):
-                segment_ids = [stream['videoPlaybackID']]
-            elif video_info.get('clips'):
-                segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
-            elif video_info.get('videoPlaybackID'):
-                segment_ids = [video_info['videoPlaybackID']]
-            elif video_info.get('id'):
-                segment_ids = [video_info['id']]
-            else:
-                if video_info.get('auth') is True:
-                    raise ExtractorError(
-                        'This video is only available via cable service provider subscription that'
-                        ' is not currently supported. You may want to use --cookies.', expected=True)
-                else:
-                    raise ExtractorError('Unable to find stream or clips')
-
-        episode_id = video_info['id']
-        episode_title = video_info['title']
-        episode_description = video_info.get('description')
-        episode_duration = int_or_none(video_info.get('duration'))
-        view_count = int_or_none(video_info.get('views'))
+            show_data = initial_data['show']
+
+            if not episode_path:
+                entries = []
+                for video in show_data.get('videos', []):
+                    slug = video.get('slug')
+                    if not slug:
+                        continue
+                    entries.append(self.url_result(
+                        'http://adultswim.com/videos/%s/%s' % (show_path, slug),
+                        'AdultSwim', video.get('id')))
+                return self.playlist_result(
+                    entries, show_data.get('id'), show_data.get('title'),
+                    strip_or_none(show_data.get('metadata', {}).get('description')))
+
+            video_data = show_data['sluggedVideo']
+            video_id = video_data['id']
+
+        info = self._extract_cvp_info(
+            'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id,
+            video_id, {
+                'secure': {
+                    'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
+                    'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
+                },
+            }, {
+                'url': url,
+                'site_name': 'AdultSwim',
+                'auth_required': video_data.get('auth'),
+            })
 
-        entries = []
-        for part_num, segment_id in enumerate(segment_ids):
-            segement_info = self._extract_cvp_info(
-                'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id,
-                segment_id, {
-                    'secure': {
-                        'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
-                        'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
-                    },
-                })
-            segment_title = '%s - %s' % (show_title, episode_title)
-            if len(segment_ids) > 1:
-                segment_title += ' Part %d' % (part_num + 1)
-            segement_info.update({
-                'id': segment_id,
-                'title': segment_title,
-                'description': episode_description,
+        info.update({
+            'id': video_id,
+            'display_id': display_id,
+            'description': info.get('description') or strip_or_none(video_data.get('description')),
+        })
+        if not is_stream:
+            info.update({
+                'duration': info.get('duration') or int_or_none(video_data.get('duration')),
+                'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')),
+                'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')),
+                'episode': info['title'],
+                'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')),
             })
-            entries.append(segement_info)
 
-        return {
-            '_type': 'playlist',
-            'id': episode_id,
-            'display_id': episode_path,
-            'entries': entries,
-            'title': '%s - %s' % (show_title, episode_title),
-            'description': episode_description,
-            'duration': episode_duration,
-            'view_count': view_count,
-        }
+            info['series'] = video_data.get('collection_title') or info.get('series')
+            if info['series'] and info['series'] != info['title']:
+                info['title'] = '%s - %s' % (info['series'], info['title'])
+
+        return info
diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py
new file mode 100644 (file)
index 0000000..6f241e6
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    try_get,
+)
+
+
+class AliExpressLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://live.aliexpress.com/live/2800002704436634',
+        'md5': 'e729e25d47c5e557f2630eaf99b740a5',
+        'info_dict': {
+            'id': '2800002704436634',
+            'ext': 'mp4',
+            'title': 'CASIMA7.22',
+            'thumbnail': r're:http://.*\.jpg',
+            'uploader': 'CASIMA Official Store',
+            'timestamp': 1500717600,
+            'upload_date': '20170722',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var',
+                webpage, 'runParams'),
+            video_id)
+
+        title = data['title']
+
+        formats = self._extract_m3u8_formats(
+            data['replyStreamUrl'], video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': data.get('coverUrl'),
+            'uploader': try_get(
+                data, lambda x: x['followBar']['name'], compat_str),
+            'timestamp': float_or_none(data.get('startTimeLong'), scale=1000),
+            'formats': formats,
+        }
index 388e578d569a27bdfd3a7d597d3ebcd5f31ccb94..c68be31340296c5b4a40fa36a7b17bc50f598f4b 100644 (file)
@@ -4,9 +4,9 @@ from .common import InfoExtractor
 
 
 class AlJazeeraIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
         'info_dict': {
             'id': '3792260579001',
@@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor):
         },
         'add_ie': ['BrightcoveNew'],
         'skip': 'Not accessible from Travis CI server',
-    }
+    }, {
+        'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+        'only_matching': True,
+    }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
index 3a0ec6776a565f66353ce7f8cb5bb188f5ccbc12..dd3b18d72d05f3deaab902b75cb6064e4b9d16ac 100644 (file)
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
 
 from .theplatform import ThePlatformIE
 from ..utils import (
-    update_url_query,
-    parse_age_limit,
     int_or_none,
+    parse_age_limit,
+    try_get,
+    update_url_query,
 )
 
 
@@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE):
         info = self._parse_theplatform_metadata(theplatform_metadata)
         video_id = theplatform_metadata['pid']
         title = theplatform_metadata['title']
-        rating = theplatform_metadata['ratings'][0]['rating']
+        rating = try_get(
+            theplatform_metadata, lambda x: x['ratings'][0]['rating'])
         auth_required = self._search_regex(
             r'window\.authRequired\s*=\s*(true|false);',
             webpage, 'auth required')
index 98f8e69cdc7780da950c2b0d0f46a498d2883d33..fde1a8ff74d8d6bcc85eb2520947e96d4205b176 100644 (file)
@@ -34,9 +34,12 @@ class AMPIE(InfoExtractor):
             if isinstance(media_thumbnail, dict):
                 media_thumbnail = [media_thumbnail]
             for thumbnail_data in media_thumbnail:
-                thumbnail = thumbnail_data['@attributes']
+                thumbnail = thumbnail_data.get('@attributes', {})
+                thumbnail_url = thumbnail.get('url')
+                if not thumbnail_url:
+                    continue
                 thumbnails.append({
-                    'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+                    'url': self._proto_relative_url(thumbnail_url, 'http:'),
                     'width': int_or_none(thumbnail.get('width')),
                     'height': int_or_none(thumbnail.get('height')),
                 })
@@ -47,9 +50,14 @@ class AMPIE(InfoExtractor):
             if isinstance(media_subtitle, dict):
                 media_subtitle = [media_subtitle]
             for subtitle_data in media_subtitle:
-                subtitle = subtitle_data['@attributes']
-                lang = subtitle.get('lang') or 'en'
-                subtitles[lang] = [{'url': subtitle['href']}]
+                subtitle = subtitle_data.get('@attributes', {})
+                subtitle_href = subtitle.get('href')
+                if not subtitle_href:
+                    continue
+                subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+                    'url': subtitle_href,
+                    'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+                })
 
         formats = []
         media_content = get_media_node('content')
index 9e28f25790bb284b5ba00376292cf33c8ded8cb1..69d36331157010cf5411316096b1a940be5422c1 100644 (file)
@@ -3,16 +3,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urlparse,
-    compat_str,
-)
+from ..compat import compat_str
 from ..utils import (
     determine_ext,
     extract_attributes,
     ExtractorError,
-    sanitized_Request,
     urlencode_postdata,
+    urljoin,
 )
 
 
@@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor):
     _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
     _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
     _NETRC_MACHINE = 'animeondemand'
+    # German-speaking countries of Europe
+    _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
     _TESTS = [{
         # jap, OmU
         'url': 'https://www.anime-on-demand.de/anime/161',
@@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor):
         # Full length film, non-series, ger/jap, Dub/OmU, account required
         'url': 'https://www.anime-on-demand.de/anime/185',
         'only_matching': True,
+    }, {
+        # Flash videos
+        'url': 'https://www.anime-on-demand.de/anime/12',
+        'only_matching': True,
     }]
 
     def _login(self):
@@ -72,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor):
             'post url', default=self._LOGIN_URL, group='url')
 
         if not post_url.startswith('http'):
-            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
-
-        request = sanitized_Request(
-            post_url, urlencode_postdata(login_form))
-        request.add_header('Referer', self._LOGIN_URL)
+            post_url = urljoin(self._LOGIN_URL, post_url)
 
         response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+            post_url, None, 'Logging in as %s' % username,
+            data=urlencode_postdata(login_form), headers={
+                'Referer': self._LOGIN_URL,
+            })
 
         if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
             error = self._search_regex(
@@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor):
             formats = []
 
             for input_ in re.findall(
-                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):
+                    r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
                 attributes = extract_attributes(input_)
+                title = attributes.get('data-dialog-header')
                 playlist_urls = []
-                for playlist_key in ('data-playlist', 'data-otherplaylist'):
+                for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
                     playlist_url = attributes.get(playlist_key)
                     if isinstance(playlist_url, compat_str) and re.match(
                             r'/?[\da-zA-Z]+', playlist_url):
@@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor):
                         format_id_list.append(compat_str(num))
                     format_id = '-'.join(format_id_list)
                     format_note = ', '.join(filter(None, (kind, lang_note)))
-                    request = sanitized_Request(
-                        compat_urlparse.urljoin(url, playlist_url),
+                    item_id_list = []
+                    if format_id:
+                        item_id_list.append(format_id)
+                    item_id_list.append('videomaterial')
+                    playlist = self._download_json(
+                        urljoin(url, playlist_url), video_id,
+                        'Downloading %s JSON' % ' '.join(item_id_list),
                         headers={
                             'X-Requested-With': 'XMLHttpRequest',
                             'X-CSRF-Token': csrf_token,
                             'Referer': url,
                             'Accept': 'application/json, text/javascript, */*; q=0.01',
-                        })
-                    playlist = self._download_json(
-                        request, video_id, 'Downloading %s playlist JSON' % format_id,
-                        fatal=False)
+                        }, fatal=False)
                     if not playlist:
                         continue
+                    stream_url = playlist.get('streamurl')
+                    if stream_url:
+                        rtmp = re.search(
+                            r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
+                            stream_url)
+                        if rtmp:
+                            formats.append({
+                                'url': rtmp.group('url'),
+                                'app': rtmp.group('app'),
+                                'play_path': rtmp.group('playpath'),
+                                'page_url': url,
+                                'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
+                                'rtmp_real_time': True,
+                                'format_id': 'rtmp',
+                                'ext': 'flv',
+                            })
+                            continue
                     start_video = playlist.get('startvideo', 0)
                     playlist = playlist.get('playlist')
                     if not playlist or not isinstance(playlist, list):
@@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor):
                     f.update({
                         'id': '%s-%s' % (f['id'], m.group('kind').lower()),
                         'title': m.group('title'),
-                        'url': compat_urlparse.urljoin(url, m.group('href')),
+                        'url': urljoin(url, m.group('href')),
                     })
                     entries.append(f)
 
index 9fd91c2f6e042f0368448dc8ab1fa28e6e207c48..8023da70236599e1777172ac416a8ad828a6ec0c 100644 (file)
@@ -5,6 +5,7 @@ import base64
 import hashlib
 import json
 import random
+import re
 import time
 
 from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
     intlist_to_bytes,
     int_or_none,
     strip_jsonp,
+    unescapeHTML,
 )
 
 
@@ -26,6 +28,8 @@ def md5_text(s):
 
 
 class AnvatoIE(InfoExtractor):
+    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
     # Copied from anvplayer.min.js
     _ANVACK_TABLE = {
         'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
         'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
     }
 
+    _MCP_TO_ACCESS_KEY_TABLE = {
+        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+        'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+    }
+
+    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
     _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
 
     def __init__(self, *args, **kwargs):
@@ -217,9 +237,42 @@ class AnvatoIE(InfoExtractor):
             'subtitles': subtitles,
         }
 
+    @staticmethod
+    def _extract_urls(ie, webpage, video_id):
+        entries = []
+        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+            anvplayer_data = ie._parse_json(
+                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+                fatal=False)
+            if not anvplayer_data:
+                continue
+            video = anvplayer_data.get('video')
+            if not isinstance(video, compat_str) or not video.isdigit():
+                continue
+            access_key = anvplayer_data.get('accessKey')
+            if not access_key:
+                mcp = anvplayer_data.get('mcp')
+                if mcp:
+                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+                        mcp.lower())
+            if not access_key:
+                continue
+            entries.append(ie.url_result(
+                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+                video_id=video))
+        return entries
+
     def _extract_anvato_videos(self, webpage, video_id):
-        anvplayer_data = self._parse_json(self._html_search_regex(
-            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
-            'Anvato player data'), video_id)
+        anvplayer_data = self._parse_json(
+            self._html_search_regex(
+                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+            video_id)
         return self._get_anvato_videos(
             anvplayer_data['accessKey'], anvplayer_data['video'])
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+        if access_key not in self._ANVACK_TABLE:
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+        return self._get_anvato_videos(access_key, video_id)
index 025e29aa46fe5db97c323fa95d947470f1f2023a..e394cb66143e2e72f029d629fe4dd6b4b5e7cf8e 100644 (file)
@@ -3,13 +3,13 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    ExtractorError,
-    HEADRequest,
+    int_or_none,
+    mimetype2ext,
 )
 
 
 class AparatIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
 
     _TEST = {
         'url': 'http://www.aparat.com/v/wP8On',
@@ -29,30 +29,41 @@ class AparatIE(InfoExtractor):
         # Note: There is an easier-to-parse configuration at
         # http://www.aparat.com/video/video/config/videohash/%video_id
         # but the URL in there does not work
-        embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id
-        webpage = self._download_webpage(embed_url, video_id)
-
-        file_list = self._parse_json(self._search_regex(
-            r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id)
-        for i, item in enumerate(file_list[0]):
-            video_url = item['file']
-            req = HEADRequest(video_url)
-            res = self._request_webpage(
-                req, video_id, note='Testing video URL %d' % i, errnote=False)
-            if res:
-                break
-        else:
-            raise ExtractorError('No working video URLs found')
+        webpage = self._download_webpage(
+            'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
+            video_id)
 
         title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title')
+
+        file_list = self._parse_json(
+            self._search_regex(
+                r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage,
+                'file list'),
+            video_id)
+
+        formats = []
+        for item in file_list[0]:
+            file_url = item.get('file')
+            if not file_url:
+                continue
+            ext = mimetype2ext(item.get('type'))
+            label = item.get('label')
+            formats.append({
+                'url': file_url,
+                'ext': ext,
+                'format_id': label or ext,
+                'height': int_or_none(self._search_regex(
+                    r'(\d+)[pP]', label or '', 'height', default=None)),
+            })
+        self._sort_formats(formats)
+
         thumbnail = self._search_regex(
             r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
 
         return {
             'id': video_id,
             'title': title,
-            'url': video_url,
-            'ext': 'mp4',
             'thumbnail': thumbnail,
             'age_limit': self._family_friendly_search(webpage),
+            'formats': formats,
         }
index a6801f3d4860414c286277c92bd994e16212cffd..b45b431e19c1526eb9578e9cbf6a13a97939b2a8 100644 (file)
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
     }, {
         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
         'info_dict': {
-            'id': 'blackthorn',
+            'id': '4489',
+            'title': 'Blackthorn',
         },
         'playlist_mincount': 2,
         'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
             'title': 'Most Popular',
             'id': 'mostpopular',
         },
-        'playlist_mincount': 80,
+        'playlist_mincount': 30,
     }, {
         'url': 'http://trailers.apple.com/#section=moviestudios',
         'info_dict': {
index 2d5599456688eba9756e28c2ffe9dbae48decb2c..3f248b14728ab3655a2e17f7b38a95184042d770 100644 (file)
@@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor):
 
         duration = int_or_none(media_info.get('_duration'))
         thumbnail = media_info.get('_previewImage')
+        is_live = media_info.get('_isLive') is True
 
         subtitles = {}
         subtitle_url = media_info.get('_subtitleUrl')
@@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor):
             'id': video_id,
             'duration': duration,
             'thumbnail': thumbnail,
+            'is_live': is_live,
             'formats': formats,
             'subtitles': subtitles,
         }
@@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor):
         # determine video id from url
         m = re.match(self._VALID_URL, url)
 
+        document_id = None
+
         numid = re.search(r'documentId=([0-9]+)', url)
         if numid:
-            video_id = numid.group(1)
+            document_id = video_id = numid.group(1)
         else:
             video_id = m.group('video_id')
 
@@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor):
                 'formats': formats,
             }
         else:  # request JSON file
+            if not document_id:
+                video_id = self._search_regex(
+                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
             info = self._extract_media_info(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
+                'http://www.ardmediathek.de/play/media/%s' % video_id,
+                webpage, video_id)
 
         info.update({
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if info.get('is_live') else title,
             'description': description,
             'thumbnail': thumbnail,
         })
index 56baef29d4f644c1b52c7d2e5f26fcca7e89e9e4..5cde90c5b23d1f92a7709c766102bb298e63cadb 100644 (file)
@@ -9,12 +9,13 @@ from ..compat import (
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    ExtractorError,
     find_xpath_attr,
-    unified_strdate,
     get_element_by_attribute,
     int_or_none,
     NO_DEFAULT,
     qualities,
+    unified_strdate,
 )
 
 # There are different sources of video in arte.tv, the extraction process
@@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor):
         info = self._download_json(json_url, video_id)
         player_info = info['videoJsonPlayer']
 
+        vsr = player_info['VSR']
+
+        if not vsr:
+            raise ExtractorError(
+                'Video %s is not available' % player_info.get('VID') or video_id,
+                expected=True)
+
         upload_date_str = player_info.get('shootingDate')
         if not upload_date_str:
             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
@@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor):
         langcode = LANGS.get(lang, lang)
 
         formats = []
-        for format_id, format_dict in player_info['VSR'].items():
+        for format_id, format_dict in vsr.items():
             f = dict(format_dict)
             versionCode = f.get('versionCode')
             l = re.escape(langcode)
diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py
new file mode 100644 (file)
index 0000000..594c88c
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import (
+    extract_attributes,
+    remove_end,
+    urlencode_postdata,
+)
+
+
+class AsianCrushIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/video/(?:[^/]+/)?0+(?P<id>\d+)v\b'
+    _TESTS = [{
+        'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/',
+        'md5': 'c3b740e48d0ba002a42c0b72857beae6',
+        'info_dict': {
+            'id': '1_y4tmjm5r',
+            'ext': 'mp4',
+            'title': 'Women Who Flirt',
+            'description': 'md5:3db14e9186197857e7063522cb89a805',
+            'timestamp': 1496936429,
+            'upload_date': '20170608',
+            'uploader_id': 'craig@crifkin.com',
+        },
+    }, {
+        'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = self._download_json(
+            'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id,
+            data=urlencode_postdata({
+                'postid': video_id,
+                'action': 'get_channel_kaltura_vars',
+            }))
+
+        entry_id = data['entry_id']
+
+        return self.url_result(
+            'kaltura:%s:%s' % (data['partner_id'], entry_id),
+            ie=KalturaIE.ie_key(), video_id=entry_id,
+            video_title=data.get('vid_label'))
+
+
+class AsianCrushPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/series/0+(?P<id>\d+)s\b'
+    _TEST = {
+        'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/',
+        'info_dict': {
+            'id': '12481',
+            'title': 'Scholar Who Walks the Night',
+            'description': 'md5:7addd7c5132a09fd4741152d96cce886',
+        },
+        'playlist_count': 20,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = []
+
+        for mobj in re.finditer(
+                r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
+                webpage):
+            attrs = extract_attributes(mobj.group(0))
+            if attrs.get('class') == 'clearfix':
+                entries.append(self.url_result(
+                    mobj.group('url'), ie=AsianCrushIE.ie_key()))
+
+        title = remove_end(
+            self._html_search_regex(
+                r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
+                'title', default=None) or self._og_search_title(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:title', webpage, 'title',
+                default=None) or self._search_regex(
+                r'<title>([^<]+)</title>', webpage, 'title', fatal=False),
+            ' | AsianCrush')
+
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:description', webpage, 'description', fatal=False)
+
+        return self.playlist_result(entries, playlist_id, title, description)
index e48bb89721df88bf75b68c8ba8feb49bedd4b2c0..393f381c6129f41688e2e163b6135a5254eaf3ea 100644 (file)
@@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor):
 
         def from_clip(field):
             if clip:
-                clip.get(field)
+                return clip.get(field)
 
         audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
             'audio', webpage, 'audio url')
index df2972f2684a82a7c58cafdd2fc6083cd035d8bd..be41bd5a22477fce2aca4a043799574e148fdc57 100644 (file)
@@ -14,14 +14,16 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
+    KNOWN_EXTENSIONS,
     parse_filesize,
     unescapeHTML,
     update_url_query,
+    unified_strdate,
 )
 
 
 class BandcampIE(InfoExtractor):
-    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
+    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
@@ -47,6 +49,7 @@ class BandcampIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         title = mobj.group('title')
         webpage = self._download_webpage(url, title)
+        thumbnail = self._html_search_meta('og:image', webpage, default=None)
         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
         if not m_download:
             m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
@@ -75,6 +78,7 @@ class BandcampIE(InfoExtractor):
                 return {
                     'id': track_id,
                     'title': data['title'],
+                    'thumbnail': thumbnail,
                     'formats': formats,
                     'duration': float_or_none(data.get('duration')),
                 }
@@ -143,7 +147,7 @@ class BandcampIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'thumbnail': info.get('thumb_url'),
+            'thumbnail': info.get('thumb_url') or thumbnail,
             'uploader': info.get('artist'),
             'artist': artist,
             'track': track,
@@ -153,7 +157,7 @@ class BandcampIE(InfoExtractor):
 
 class BandcampAlbumIE(InfoExtractor):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
 
     _TESTS = [{
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -220,6 +224,12 @@ class BandcampAlbumIE(InfoExtractor):
         'playlist_count': 2,
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
+                else super(BandcampAlbumIE, cls).suitable(url))
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         uploader_id = mobj.group('subdomain')
@@ -232,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor):
             raise ExtractorError('The page doesn\'t contain any tracks')
         # Only tracks with duration info have songs
         entries = [
-            self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+            self.url_result(
+                compat_urlparse.urljoin(url, t_path),
+                ie=BandcampIE.ie_key(),
+                video_title=self._search_regex(
+                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+                    elem_content, 'track title', fatal=False))
             for elem_content, t_path in track_elements
             if self._html_search_meta('duration', elem_content, default=None)]
 
@@ -248,3 +263,92 @@ class BandcampAlbumIE(InfoExtractor):
             'title': title,
             'entries': entries,
         }
+
+
+class BandcampWeeklyIE(InfoExtractor):
+    IE_NAME = 'Bandcamp:weekly'
+    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://bandcamp.com/?show=224',
+        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
+        'info_dict': {
+            'id': '224',
+            'ext': 'opus',
+            'title': 'BC Weekly April 4th 2017 - Magic Moments',
+            'description': 'md5:5d48150916e8e02d030623a48512c874',
+            'duration': 5829.77,
+            'release_date': '20170404',
+            'series': 'Bandcamp Weekly',
+            'episode': 'Magic Moments',
+            'episode_number': 208,
+            'episode_id': '224',
+        }
+    }, {
+        'url': 'https://bandcamp.com/?blah/blah@&show=228',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        blob = self._parse_json(
+            self._search_regex(
+                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
+                'blob', group='blob'),
+            video_id, transform_source=unescapeHTML)
+
+        show = blob['bcw_show']
+
+        # This is desired because any invalid show id redirects to `bandcamp.com`
+        # which happens to expose the latest Bandcamp Weekly episode.
+        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+
+        formats = []
+        for format_id, format_url in show['audio_stream'].items():
+            if not isinstance(format_url, compat_str):
+                continue
+            for known_ext in KNOWN_EXTENSIONS:
+                if known_ext in format_id:
+                    ext = known_ext
+                    break
+            else:
+                ext = None
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'ext': ext,
+                'vcodec': 'none',
+            })
+        self._sort_formats(formats)
+
+        title = show.get('audio_title') or 'Bandcamp Weekly'
+        subtitle = show.get('subtitle')
+        if subtitle:
+            title += ' - %s' % subtitle
+
+        episode_number = None
+        seq = blob.get('bcw_seq')
+
+        if seq and isinstance(seq, list):
+            try:
+                episode_number = next(
+                    int_or_none(e.get('episode_number'))
+                    for e in seq
+                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
+            except StopIteration:
+                pass
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': show.get('desc') or show.get('short_desc'),
+            'duration': float_or_none(show.get('audio_duration')),
+            'is_live': False,
+            'release_date': unified_strdate(show.get('published_date')),
+            'series': 'Bandcamp Weekly',
+            'episode': show.get('subtitle'),
+            'episode_number': episode_number,
+            'episode_id': compat_str(video_id),
+            'formats': formats
+        }
index dd65b8d86a36f3bbb486072260f14ab652a1868e..8b20c03d6e424b95e42b1bea1ac3fb91e24bea11 100644 (file)
@@ -6,14 +6,18 @@ import itertools
 
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
     dict_get,
     ExtractorError,
     float_or_none,
+    get_element_by_class,
     int_or_none,
     parse_duration,
     parse_iso8601,
     try_get,
     unescapeHTML,
+    urlencode_postdata,
+    urljoin,
 )
 from ..compat import (
     compat_etree_fromstring,
@@ -25,19 +29,23 @@ from ..compat import (
 class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
-    _ID_REGEX = r'[pb][\da-z]{7}'
+    _ID_REGEX = r'[pbw][\da-z]{7}'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:www\.)?bbc\.co\.uk/
                         (?:
                             programmes/(?!articles/)|
                             iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
-                            music/clips[/#]|
-                            radio/player/
+                            music/(?:clips|audiovideo/popular)[/#]|
+                            radio/player/|
+                            events/[^/]+/play/[^/]+/
                         )
                         (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
 
+    _LOGIN_URL = 'https://account.bbc.com/signin'
+    _NETRC_MACHINE = 'bbc'
+
     _MEDIASELECTOR_URLS = [
         # Provides HQ HLS streams with even better quality that pc mediaset but fails
         # with geolocation in some cases when it's even not geo restricted at all (e.g.
@@ -222,11 +230,49 @@ class BBCCoUkIE(InfoExtractor):
         }, {
             'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
             'only_matching': True,
-        }
-    ]
+        }, {
+            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
+            'only_matching': True,
+        }, {
+            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
+            'only_matching': True,
+        }]
 
     _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
 
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading signin page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password,
+        })
+
+        post_url = urljoin(self._LOGIN_URL, self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post url', default=self._LOGIN_URL, group='url'))
+
+        response, urlh = self._download_webpage_handle(
+            post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+            headers={'Referer': self._LOGIN_URL})
+
+        if self._LOGIN_URL in urlh.geturl():
+            error = clean_html(get_element_by_class('form-message', response))
+            if error:
+                raise ExtractorError(
+                    'Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _real_initialize(self):
+        self._login()
+
     class MediaSelectionError(Exception):
         def __init__(self, id):
             self.id = id
@@ -483,6 +529,12 @@ class BBCCoUkIE(InfoExtractor):
 
         webpage = self._download_webpage(url, group_id, 'Downloading video page')
 
+        error = self._search_regex(
+            r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
+            webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
         programme_id = None
         duration = None
 
index f3a9e3278f0cb3b4426d519162b996a835761af9..2eaec1ab4981d87281b9580d9798956d1b36ea29 100644 (file)
@@ -6,18 +6,33 @@ from ..utils import (
     ExtractorError,
     clean_html,
     compat_str,
+    float_or_none,
     int_or_none,
     parse_iso8601,
     try_get,
+    urljoin,
 )
 
 
-class BeamProLiveIE(InfoExtractor):
-    IE_NAME = 'Beam:live'
-    _VALID_URL = r'https?://(?:\w+\.)?beam\.pro/(?P<id>[^/?#&]+)'
+class BeamProBaseIE(InfoExtractor):
+    _API_BASE = 'https://mixer.com/api/v1'
     _RATINGS = {'family': 0, 'teen': 13, '18+': 18}
+
+    def _extract_channel_info(self, chan):
+        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id'])
+        return {
+            'uploader': chan.get('token') or try_get(
+                chan, lambda x: x['user']['username'], compat_str),
+            'uploader_id': compat_str(user_id) if user_id else None,
+            'age_limit': self._RATINGS.get(chan.get('audience')),
+        }
+
+
+class BeamProLiveIE(BeamProBaseIE):
+    IE_NAME = 'Mixer:live'
+    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)'
     _TEST = {
-        'url': 'http://www.beam.pro/niterhayven',
+        'url': 'http://mixer.com/niterhayven',
         'info_dict': {
             'id': '261562',
             'ext': 'mp4',
@@ -38,11 +53,17 @@ class BeamProLiveIE(InfoExtractor):
         },
     }
 
+    _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE
+
+    @classmethod
+    def suitable(cls, url):
+        return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url)
+
     def _real_extract(self, url):
         channel_name = self._match_id(url)
 
         chan = self._download_json(
-            'https://beam.pro/api/v1/channels/%s' % channel_name, channel_name)
+            '%s/channels/%s' % (self._API_BASE, channel_name), channel_name)
 
         if chan.get('online') is False:
             raise ExtractorError(
@@ -50,24 +71,118 @@ class BeamProLiveIE(InfoExtractor):
 
         channel_id = chan['id']
 
+        def manifest_url(kind):
+            return self._MANIFEST_URL_TEMPLATE % (channel_id, kind)
+
         formats = self._extract_m3u8_formats(
-            'https://beam.pro/api/v1/channels/%s/manifest.m3u8' % channel_id,
-            channel_name, ext='mp4', m3u8_id='hls', fatal=False)
+            manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls',
+            fatal=False)
+        formats.extend(self._extract_smil_formats(
+            manifest_url('smil'), channel_name, fatal=False))
         self._sort_formats(formats)
 
-        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id'])
-
-        return {
+        info = {
             'id': compat_str(chan.get('id') or channel_name),
             'title': self._live_title(chan.get('name') or channel_name),
             'description': clean_html(chan.get('description')),
-            'thumbnail': try_get(chan, lambda x: x['thumbnail']['url'], compat_str),
+            'thumbnail': try_get(
+                chan, lambda x: x['thumbnail']['url'], compat_str),
             'timestamp': parse_iso8601(chan.get('updatedAt')),
-            'uploader': chan.get('token') or try_get(
-                chan, lambda x: x['user']['username'], compat_str),
-            'uploader_id': compat_str(user_id) if user_id else None,
-            'age_limit': self._RATINGS.get(chan.get('audience')),
             'is_live': True,
             'view_count': int_or_none(chan.get('viewersTotal')),
             'formats': formats,
         }
+        info.update(self._extract_channel_info(chan))
+
+        return info
+
+
+class BeamProVodIE(BeamProBaseIE):
+    IE_NAME = 'Mixer:vod'
+    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://mixer.com/willow8714?vod=2259830',
+        'md5': 'b2431e6e8347dc92ebafb565d368b76b',
+        'info_dict': {
+            'id': '2259830',
+            'ext': 'mp4',
+            'title': 'willow8714\'s Channel',
+            'duration': 6828.15,
+            'thumbnail': r're:https://.*source\.png$',
+            'timestamp': 1494046474,
+            'upload_date': '20170506',
+            'uploader': 'willow8714',
+            'uploader_id': '6085379',
+            'age_limit': 13,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    @staticmethod
+    def _extract_format(vod, vod_type):
+        if not vod.get('baseUrl'):
+            return []
+
+        if vod_type == 'hls':
+            filename, protocol = 'manifest.m3u8', 'm3u8_native'
+        elif vod_type == 'raw':
+            filename, protocol = 'source.mp4', 'https'
+        else:
+            assert False
+
+        data = vod.get('data') if isinstance(vod.get('data'), dict) else {}
+
+        format_id = [vod_type]
+        if isinstance(data.get('Height'), compat_str):
+            format_id.append('%sp' % data['Height'])
+
+        return [{
+            'url': urljoin(vod['baseUrl'], filename),
+            'format_id': '-'.join(format_id),
+            'ext': 'mp4',
+            'protocol': protocol,
+            'width': int_or_none(data.get('Width')),
+            'height': int_or_none(data.get('Height')),
+            'fps': int_or_none(data.get('Fps')),
+            'tbr': int_or_none(data.get('Bitrate'), 1000),
+        }]
+
+    def _real_extract(self, url):
+        vod_id = self._match_id(url)
+
+        vod_info = self._download_json(
+            '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id)
+
+        state = vod_info.get('state')
+        if state != 'AVAILABLE':
+            raise ExtractorError(
+                'VOD %s is not available (state: %s)' % (vod_id, state),
+                expected=True)
+
+        formats = []
+        thumbnail_url = None
+
+        for vod in vod_info['vods']:
+            vod_type = vod.get('format')
+            if vod_type in ('hls', 'raw'):
+                formats.extend(self._extract_format(vod, vod_type))
+            elif vod_type == 'thumbnail':
+                thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png')
+
+        self._sort_formats(formats)
+
+        info = {
+            'id': vod_id,
+            'title': vod_info.get('name') or vod_id,
+            'duration': float_or_none(vod_info.get('duration')),
+            'thumbnail': thumbnail_url,
+            'timestamp': parse_iso8601(vod_info.get('createdAt')),
+            'view_count': int_or_none(vod_info.get('viewsTotal')),
+            'formats': formats,
+        }
+        info.update(self._extract_channel_info(vod_info.get('channel') or {}))
+
+        return info
index 80dd8382e4e8758274e3a7ba2418479ee3d2fbbc..1e57310d657fee2c47cb8e63a9868a36724f0f6a 100644 (file)
@@ -54,6 +54,22 @@ class BiliBiliIE(InfoExtractor):
             'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
         },
         'skip': 'Geo-restricted to China',
+    }, {
+        # Title with double quotes
+        'url': 'http://www.bilibili.com/video/av8903802/',
+        'info_dict': {
+            'id': '8903802',
+            'ext': 'mp4',
+            'title': '阿滴英文|英文歌分享#6 "Closer',
+            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+            'uploader': '阿滴英文',
+            'uploader_id': '65880958',
+            'timestamp': 1488382620,
+            'upload_date': '20170301',
+        },
+        'params': {
+            'skip_download': True,  # Test metadata only
+        },
     }]
 
     _APP_KEY = '84956560bc028eb7'
@@ -122,6 +138,11 @@ class BiliBiliIE(InfoExtractor):
                     'preference': -2 if 'hd.mp4' in backup_url else -3,
                 })
 
+            for a_format in formats:
+                a_format.setdefault('http_headers', {}).update({
+                    'Referer': url,
+                })
+
             self._sort_formats(formats)
 
             entries.append({
@@ -130,7 +151,7 @@ class BiliBiliIE(InfoExtractor):
                 'formats': formats,
             })
 
-        title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
+        title = self._html_search_regex('<h1[^>]*>([^<]+)</h1>', webpage, 'title')
         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None))
index 9661ade4f312e5c5e1068a42d3a693dd936fd1d6..07833532e9e4d2d992ac1b2501439b6b9f95e7a4 100644 (file)
@@ -33,13 +33,18 @@ class BpbIE(InfoExtractor):
         title = self._html_search_regex(
             r'<h2 class="white">(.*?)</h2>', webpage, 'title')
         video_info_dicts = re.findall(
-            r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage)
+            r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
 
         formats = []
         for video_info in video_info_dicts:
-            video_info = self._parse_json(video_info, video_id, transform_source=js_to_json)
-            quality = video_info['quality']
-            video_url = video_info['src']
+            video_info = self._parse_json(
+                video_info, video_id, transform_source=js_to_json, fatal=False)
+            if not video_info:
+                continue
+            video_url = video_info.get('src')
+            if not video_url:
+                continue
+            quality = 'high' if '_high' in video_url else 'low'
             formats.append({
                 'url': video_url,
                 'preference': 10 if quality == 'high' else 0,
index 97602ca303e4543b4a31f556e526e26df21bc0b5..0ed59bcbc44ecaf413d7003c4f6f605b1201ca10 100644 (file)
@@ -5,6 +5,7 @@ import re
 import json
 
 from .common import InfoExtractor
+from .adobepass import AdobePassIE
 from ..compat import (
     compat_etree_fromstring,
     compat_parse_qs,
@@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor):
         return info
 
 
-class BrightcoveNewIE(InfoExtractor):
+class BrightcoveNewIE(AdobePassIE):
     IE_NAME = 'brightcove:new'
     _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
     _TESTS = [{
@@ -522,7 +523,7 @@ class BrightcoveNewIE(InfoExtractor):
         # [2] looks like:
         for video, script_tag, account_id, player_id, embed in re.findall(
                 r'''(?isx)
-                    (<video\s+[^>]*data-video-id=['"]?[^>]+>)
+                    (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
                     (?:.*?
                         (<script[^>]+
                             src=["\'](?:https?:)?//players\.brightcove\.net/
@@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor):
                 raise ExtractorError(message, expected=True)
             raise
 
+        errors = json_data.get('errors')
+        if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
+            custom_fields = json_data['custom_fields']
+            tve_token = self._extract_mvpd_auth(
+                smuggled_data['source_url'], video_id,
+                custom_fields['bcadobepassrequestorid'],
+                custom_fields['bcadobepassresourceid'])
+            json_data = self._download_json(
+                api_url, video_id, headers={
+                    'Accept': 'application/json;pk=%s' % policy_key
+                }, query={
+                    'tveToken': tve_token,
+                })
+
         title = json_data['name'].strip()
 
         formats = []
@@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor):
                     })
                 formats.append(f)
 
-        errors = json_data.get('errors')
         if not formats and errors:
             error = errors[0]
             raise ExtractorError(
@@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor):
 
         is_live = False
         duration = float_or_none(json_data.get('duration'), 1000)
-        if duration and duration < 0:
+        if duration is not None and duration <= 0:
             is_live = True
 
         return {
index 75fa92d7cfc0204f4539e10c762585b2537abbb6..ec411091efe7dc15b28c7f4a3939bff89395fa59 100644 (file)
@@ -84,9 +84,10 @@ class BuzzFeedIE(InfoExtractor):
                 continue
             entries.append(self.url_result(video['url']))
 
-        facebook_url = FacebookIE._extract_url(webpage)
-        if facebook_url:
-            entries.append(self.url_result(facebook_url))
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        entries.extend([
+            self.url_result(facebook_url)
+            for facebook_url in facebook_urls])
 
         return {
             '_type': 'playlist',
index cf678e7f843225f00a69546c59ba27a2b9c93c3d..9faf4022758c46e8ebbd38db74881ad26e692f19 100644 (file)
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
         'info_dict': {
             'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
             'id': 'dog-indoor-exercise-winter-1.3928238',
+            'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
         },
         'playlist_mincount': 6,
     }]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
             'uploader': 'CBCC-NEW',
         },
     }, {
-        # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
         'url': 'http://www.cbc.ca/player/play/2164402062',
-        'md5': '17a61eb813539abea40618d6323a7f82',
+        'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
         'info_dict': {
             'id': '2164402062',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Cancer survivor four times over',
             'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
             'timestamp': 1320410746,
@@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor):
         'media': 'http://search.yahoo.com/mrss/',
         'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
     }
+    _GEO_COUNTRIES = ['CA']
 
     def _call_api(self, path, video_id):
         url = path if path.startswith('http') else self._API_BASE_URL + path
@@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor):
 class CBCWatchVideoIE(CBCWatchBaseIE):
     IE_NAME = 'cbc.ca:watch:video'
     _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TEST = {
+        # geo-restricted to Canada, bypassable
+        'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235',
+        'only_matching': True,
+    }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE):
     IE_NAME = 'cbc.ca:watch'
     _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
     _TESTS = [{
+        # geo-restricted to Canada, bypassable
         'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
         'info_dict': {
-            'id': '38e815a-009e3ab12e4',
+            'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
             'ext': 'mp4',
             'title': 'Customer (Dis)Service',
             'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
@@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE):
             'skip_download': True,
             'format': 'bestvideo',
         },
-        'skip': 'Geo-restricted to Canada',
     }, {
+        # geo-restricted to Canada, bypassable
         'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
         'info_dict': {
             'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
@@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE):
             'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
         },
         'playlist_mincount': 30,
-        'skip': 'Geo-restricted to Canada',
     }]
 
     def _real_extract(self, url):
index 58f258c54b059b09888cf0e26a4718a69c704faa..1268e38ef3c266bd5f4fac39b961f4788150ba71 100644 (file)
@@ -49,13 +49,13 @@ class CBSIE(CBSBaseIE):
         'only_matching': True,
     }]
 
-    def _extract_video_info(self, content_id):
+    def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
         items_data = self._download_xml(
             'http://can.cbs.com/thunder/player/videoPlayerService.php',
-            content_id, query={'partner': 'cbs', 'contentId': content_id})
+            content_id, query={'partner': site, 'contentId': content_id})
         video_data = xpath_element(items_data, './/item')
         title = xpath_text(video_data, 'videoTitle', 'title', True)
-        tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id
+        tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
         tp_release_url = 'http://link.theplatform.com/s/' + tp_path
 
         asset_types = []
index 57b18e81d412b20162f60e8d8e44699b76f2e3af..681d63e29222715a0bb0d45462169edde0787330 100644 (file)
@@ -3,17 +3,18 @@ from __future__ import unicode_literals
 
 import re
 
-from .theplatform import ThePlatformIE
+from .cbs import CBSIE
 from ..utils import int_or_none
 
 
-class CBSInteractiveIE(ThePlatformIE):
-    _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)'
+class CBSInteractiveIE(CBSIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)'
     _TESTS = [{
         'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
         'info_dict': {
-            'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
-            'ext': 'flv',
+            'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00',
+            'display_id': 'hands-on-with-microsofts-windows-8-1-update',
+            'ext': 'mp4',
             'title': 'Hands-on with Microsoft Windows 8.1 Update',
             'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
             'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
@@ -22,13 +23,19 @@ class CBSInteractiveIE(ThePlatformIE):
             'timestamp': 1396479627,
             'upload_date': '20140402',
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+        'md5': 'f11d27b2fa18597fbf92444d2a9ed386',
         'info_dict': {
-            'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
-            'ext': 'flv',
+            'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK',
+            'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187',
+            'ext': 'mp4',
             'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
-            'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+            'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f',
             'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
             'uploader': 'Ashley Esqueda',
             'duration': 1482,
@@ -38,23 +45,28 @@ class CBSInteractiveIE(ThePlatformIE):
     }, {
         'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/',
         'info_dict': {
-            'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0',
+            'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt',
+            'display_id': 'video-keeping-android-smartphones-and-tablets-secure',
             'ext': 'mp4',
             'title': 'Video: Keeping Android smartphones and tablets secure',
             'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.',
             'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0',
             'uploader': 'Adrian Kingsley-Hughes',
-            'timestamp': 1448961720,
-            'upload_date': '20151201',
+            'duration': 731,
+            'timestamp': 1449129925,
+            'upload_date': '20151203',
         },
         'params': {
             # m3u8 download
             'skip_download': True,
-        }
+        },
+    }, {
+        'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/',
+        'only_matching': True,
     }]
-    TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true'
+
     MPX_ACCOUNTS = {
-        'cnet': 2288573011,
+        'cnet': 2198311517,
         'zdnet': 2387448114,
     }
 
@@ -68,7 +80,8 @@ class CBSInteractiveIE(ThePlatformIE):
         data = self._parse_json(data_json, display_id)
         vdata = data.get('video') or data['videos'][0]
 
-        video_id = vdata['id']
+        video_id = vdata['mpxRefId']
+
         title = vdata['title']
         author = vdata.get('author')
         if author:
@@ -78,20 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):
             uploader = None
             uploader_id = None
 
-        media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
-        formats, subtitles = [], {}
-        for (fkey, vid) in vdata['files'].items():
-            if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
-                continue
-            release_url = self.TP_RELEASE_URL_TEMPLATE % vid
-            if fkey == 'hds':
-                release_url += '&manifest=f4m'
-            tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)
-            formats.extend(tp_formats)
-            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
-        self._sort_formats(formats)
-
-        info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)
+        info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site])
         info.update({
             'id': video_id,
             'display_id': display_id,
@@ -99,7 +99,5 @@ class CBSInteractiveIE(ThePlatformIE):
             'duration': int_or_none(vdata.get('duration')),
             'uploader': uploader,
             'uploader_id': uploader_id,
-            'subtitles': subtitles,
-            'formats': formats,
         })
         return info
index 17bb9af4fe2a8a0066611a2bbc2d090ad7cf5e30..51df15faca995d8055361fccc8dc74a290e806d9 100644 (file)
@@ -15,19 +15,23 @@ class CBSNewsIE(CBSIE):
 
     _TESTS = [
         {
-            'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
+            # 60 minutes
+            'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/',
             'info_dict': {
-                'id': 'tesla-and-spacex-elon-musks-industrial-empire',
-                'ext': 'flv',
-                'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
-                'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
-                'duration': 791,
+                'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_',
+                'ext': 'mp4',
+                'title': 'Artificial Intelligence',
+                'description': 'md5:8818145f9974431e0fb58a1b8d69613c',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 1606,
+                'uploader': 'CBSI-NEW',
+                'timestamp': 1498431900,
+                'upload_date': '20170625',
             },
             'params': {
-                # rtmp download
+                # m3u8 download
                 'skip_download': True,
             },
-            'skip': 'Subscribers only',
         },
         {
             'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@@ -52,6 +56,22 @@ class CBSNewsIE(CBSIE):
                 'skip_download': True,
             },
         },
+        {
+            # 48 hours
+            'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
+            'info_dict': {
+                'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1',
+                'ext': 'mp4',
+                'title': 'Cold as Ice',
+                'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? "48 Hours" correspondent Erin Moriarty has the latest.',
+                'upload_date': '20170604',
+                'timestamp': 1496538000,
+                'uploader': 'CBSI-NEW',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     def _real_extract(self, url):
@@ -60,12 +80,18 @@ class CBSNewsIE(CBSIE):
         webpage = self._download_webpage(url, video_id)
 
         video_info = self._parse_json(self._html_search_regex(
-            r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
-            webpage, 'video JSON info'), video_id)
+            r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
+            webpage, 'video JSON info', default='{}'), video_id, fatal=False)
+
+        if video_info:
+            item = video_info['item'] if 'item' in video_info else video_info
+        else:
+            state = self._parse_json(self._search_regex(
+                r'data-cbsvideoui-options=(["\'])(?P<json>{.+?})\1', webpage,
+                'playlist JSON info', group='json'), video_id)['state']
+            item = state['playlist'][state['pid']]
 
-        item = video_info['item'] if 'item' in video_info else video_info
-        guid = item['mpxRefId']
-        return self._extract_video_info(guid)
+        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
 
 
 class CBSNewsLiveVideoIE(InfoExtractor):
index 1ee35b50197d148f1d8564a9c553bd8a8031cd24..0c3af23d58270a9710c54e30a0eee7b375be562c 100755 (executable)
@@ -9,7 +9,10 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
+    multipart_encode,
     parse_duration,
+    random_birthday,
+    urljoin,
 )
 
 
@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
             'description': 'md5:269ccd135d550da90d1662651fcb9772',
             'thumbnail': r're:^https?://.*\.jpg$',
             'average_rating': float,
-            'duration': 39
+            'duration': 39,
+            'age_limit': 0,
         }
     }, {
         'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
             'uploader': 'crash404',
             'view_count': int,
             'average_rating': float,
-            'duration': 137
+            'duration': 137,
+            'age_limit': 0,
         }
+    }, {
+        # Age-restricted
+        'url': 'http://www.cda.pl/video/1273454c4',
+        'info_dict': {
+            'id': '1273454c4',
+            'ext': 'mp4',
+            'title': 'Bronson (2008) napisy HD 1080p',
+            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+            'height': 1080,
+            'uploader': 'boniek61',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 5554,
+            'age_limit': 18,
+            'view_count': int,
+            'average_rating': float,
+        },
     }, {
         'url': 'http://ebd.cda.pl/0x0/5749950c',
         'only_matching': True,
     }]
 
+    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+        form_data = random_birthday('rok', 'miesiac', 'dzien')
+        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+        data, content_type = multipart_encode(form_data)
+        return self._download_webpage(
+            urljoin(url, '/a/validatebirth'), video_id, *args,
+            data=data, headers={
+                'Referer': url,
+                'Content-Type': content_type,
+            }, **kwargs)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
             raise ExtractorError('This video is only available for premium users.', expected=True)
 
+        need_confirm_age = False
+        if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+                                   webpage, 'birthday validate form', default=None):
+            webpage = self._download_age_confirm_page(
+                url, video_id, note='Confirming age')
+            need_confirm_age = True
+
         formats = []
 
         uploader = self._search_regex(r'''(?x)
@@ -81,10 +120,11 @@ class CDAIE(InfoExtractor):
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
             'duration': None,
+            'age_limit': 18 if need_confirm_age else 0,
         }
 
         def extract_format(page, version):
-            json_str = self._search_regex(
+            json_str = self._html_search_regex(
                 r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
                 '%s player_json' % version, fatal=False, group='player_data')
             if not json_str:
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
         for href, resolution in re.findall(
                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                 webpage):
-            webpage = self._download_webpage(
+            if need_confirm_age:
+                handler = self._download_age_confirm_page
+            else:
+                handler = self._download_webpage
+
+            webpage = handler(
                 self._BASE_URL + href, video_id,
                 'Downloading %s version information' % resolution, fatal=False)
             if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
                 # invalid version is requested.
                 self.report_warning('Unable to download %s version information' % resolution)
                 continue
+
             extract_format(webpage, resolution)
 
         self._sort_formats(formats)
index 2d517f23194b023a7fcd0fd917a63ac08a6fa80c..42c9af263e994d89edece2d29aac8949d577c69f 100644 (file)
@@ -5,7 +5,7 @@ from ..utils import remove_end
 
 
 class CharlieRoseIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://charlierose.com/videos/27996',
         'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
@@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor):
     }, {
         'url': 'https://charlierose.com/videos/27996',
         'only_matching': True,
+    }, {
+        'url': 'https://charlierose.com/episodes/30887?autoplay=true',
+        'only_matching': True,
     }]
 
     _PLAYER_BASE = 'https://charlierose.com/video/player/%s'
index 0206d96db4670fb29a40353839dae15911b9c6d3..d4769da75e021562d4229474585e048780e9d0d2 100644 (file)
@@ -5,6 +5,7 @@ import base64
 import json
 
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..utils import (
     clean_html,
     ExtractorError
@@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor):
 
         # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
         if native_platform is None:
-            youtube_url = self._html_search_regex(
-                r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
-                webpage, 'fallback video URL', default=None)
-            if youtube_url is not None:
-                return self.url_result(youtube_url, ie='Youtube')
+            youtube_url = YoutubeIE._extract_url(webpage)
+            if youtube_url:
+                return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
 
         # Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
         # the own CDN
index 562c9bbbb4d02305d22a7ad5bc22ef9056d25258..b861d54b0a1a3c37ca9999d4bbb865d10e36abcf 100644 (file)
@@ -9,12 +9,20 @@ from ..utils import (
 
 
 class CinchcastIE(InfoExtractor):
-    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
-    _TEST = {
+    _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
+        'info_dict': {
+            'id': '5258197',
+            'ext': 'mp3',
+            'title': 'Train Your Brain to Up Your Game with Coach Mandy',
+            'upload_date': '20130816',
+        },
+    }, {
         # Actual test is run in generic, look for undergroundwellness
         'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
         'only_matching': True,
-    }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py
new file mode 100644 (file)
index 0000000..505bdbe
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unescapeHTML,
+)
+
+
+class CJSWIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
+        'md5': 'cee14d40f1e9433632c56e3d14977120',
+        'info_dict': {
+            'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
+            'ext': 'mp3',
+            'title': 'Freshly Squeezed – Episode June 20, 2017',
+            'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
+            'series': 'Freshly Squeezed',
+            'episode_id': '20170620',
+        },
+    }, {
+        # no description
+        'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        program, episode_id = mobj.group('program', 'id')
+        audio_id = '%s/%s' % (program, episode_id)
+
+        webpage = self._download_webpage(url, episode_id)
+
+        title = unescapeHTML(self._search_regex(
+            (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)',
+             r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='title'))
+
+        audio_url = self._search_regex(
+            r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'audio url', group='url')
+
+        audio_id = self._search_regex(
+            r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
+            audio_url, 'audio id', default=audio_id)
+
+        formats = [{
+            'url': audio_url,
+            'ext': determine_ext(audio_url, 'mp3'),
+            'vcodec': 'none',
+        }]
+
+        description = self._html_search_regex(
+            r'<p>(?P<description>.+?)</p>', webpage, 'description',
+            default=None)
+        series = self._search_regex(
+            r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
+            'series', default=program, group='name')
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'series': series,
+            'episode_id': episode_id,
+        }
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
deleted file mode 100644 (file)
index 0920f62..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    unified_strdate,
-)
-
-
-class ClipfishIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
-        'md5': 'b9a5dc46294154c1193e2d10e0c95693',
-        'info_dict': {
-            'id': '4343170',
-            'ext': 'mp4',
-            'title': 'S01 E01 - Ugly Americans - Date in der Hölle',
-            'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.',
-            'upload_date': '20161005',
-            'duration': 1291,
-            'view_count': int,
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        video_info = self._download_json(
-            'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id,
-            video_id)['items'][0]
-
-        formats = []
-
-        m3u8_url = video_info.get('media_videourl_hls')
-        if m3u8_url:
-            formats.append({
-                'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
-                'ext': 'mp4',
-                'format_id': 'hls',
-            })
-
-        mp4_url = video_info.get('media_videourl')
-        if mp4_url:
-            formats.append({
-                'url': mp4_url,
-                'format_id': 'mp4',
-                'width': int_or_none(video_info.get('width')),
-                'height': int_or_none(video_info.get('height')),
-                'tbr': int_or_none(video_info.get('bitrate')),
-            })
-
-        descr = video_info.get('descr')
-        if descr:
-            descr = descr.strip()
-
-        return {
-            'id': video_id,
-            'title': video_info['title'],
-            'description': descr,
-            'formats': formats,
-            'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'),
-            'duration': int_or_none(video_info.get('media_length')),
-            'upload_date': unified_strdate(video_info.get('pubDate')),
-            'view_count': int_or_none(video_info.get('media_views'))
-        }
diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py
new file mode 100644 (file)
index 0000000..a1a7a77
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    qualities,
+)
+
+import re
+
+
+class ClippitIE(InfoExtractor):
+
+    _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)'
+    _TEST = {
+        'url': 'https://www.clippituser.tv/c/evmgm',
+        'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09',
+        'info_dict': {
+            'id': 'evmgm',
+            'ext': 'mp4',
+            'title': 'Bye bye Brutus. #BattleBots  - Clippit',
+            'uploader': 'lizllove',
+            'uploader_url': 'https://www.clippituser.tv/p/lizllove',
+            'timestamp': 1472183818,
+            'upload_date': '20160826',
+            'description': 'BattleBots | ABC',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title')
+
+        FORMATS = ('sd', 'hd')
+        quality = qualities(FORMATS)
+        formats = []
+        for format_id in FORMATS:
+            url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id,
+                                          webpage, 'url', fatal=False)
+            if not url:
+                continue
+            match = re.search(r'/(?P<height>\d+)\.mp4', url)
+            formats.append({
+                'url': url,
+                'format_id': format_id,
+                'quality': quality(format_id),
+                'height': int(match.group('height')) if match else None,
+            })
+
+        uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n',
+                                           webpage, 'uploader', fatal=False)
+        uploader_url = ('https://www.clippituser.tv/p/' + uploader
+                        if uploader else None)
+
+        timestamp = self._html_search_regex(r'datetime="(.+?)"',
+                                            webpage, 'date', fatal=False)
+        thumbnail = self._html_search_regex(r'data-image="(.+?)"',
+                                            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+            'timestamp': parse_iso8601(timestamp),
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+        }
index 9bc8dbea449509cf275d10470d6e01f337288a21..85ca20eccd0c5bb2e8e39468d6ad62428d918059 100644 (file)
@@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(
-            'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
+            'https://www.cloudy.ec/embed.php', video_id, query={
+                'id': video_id,
+                'playerPage': 1,
+                'autoplay': 1,
+            })
 
         info = self._parse_html5_media_entries(url, webpage, video_id)[0]
 
index 2cb55d6af826db72ff3539aa88c3c2f0768e38e6..74d30ec50ac7f9d5db44d46bab1dd482fcc34f59 100644 (file)
@@ -27,6 +27,7 @@ from ..compat import (
     compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 )
 from ..downloader.f4m import remove_encrypted_media
 from ..utils import (
@@ -245,6 +246,10 @@ class InfoExtractor(object):
                     specified in the URL.
     end_time:       Time in seconds where the reproduction should end, as
                     specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)
 
     The following fields should only be used when the video belongs to some logical
     chapter or section:
@@ -372,7 +377,7 @@ class InfoExtractor(object):
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         m = cls._VALID_URL_RE.match(url)
         assert m
-        return m.group('id')
+        return compat_str(m.group('id'))
 
     @classmethod
     def working(cls):
@@ -416,7 +421,7 @@ class InfoExtractor(object):
             if country_code:
                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                 if self._downloader.params.get('verbose', False):
-                    self._downloader.to_stdout(
+                    self._downloader.to_screen(
                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                         % (self._x_forwarded_for_ip, country_code.upper()))
 
@@ -642,15 +647,29 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
+                      transform_source=None, fatal=True, encoding=None,
+                      data=None, headers={}, query={}):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding, data=data, headers=headers, query=query)
         if xml_string is False:
             return xml_string
+        return self._parse_xml(
+            xml_string, video_id, transform_source=transform_source,
+            fatal=fatal)
+
+    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
         if transform_source:
             xml_string = transform_source(xml_string)
-        return compat_etree_fromstring(xml_string.encode('utf-8'))
+        try:
+            return compat_etree_fromstring(xml_string.encode('utf-8'))
+        except compat_xml_parse_error as ve:
+            errmsg = '%s: Failed to parse XML ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
 
     def _download_json(self, url_or_request, video_id,
                        note='Downloading JSON metadata',
@@ -726,12 +745,12 @@ class InfoExtractor(object):
             video_info['title'] = video_title
         return video_info
 
-    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
-        urlrs = orderedSet(
+    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
+        urls = orderedSet(
             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
             for m in matches)
         return self.playlist_result(
-            urlrs, playlist_id=video_id, playlist_title=video_title)
+            urls, playlist_id=playlist_id, playlist_title=playlist_title)
 
     @staticmethod
     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@@ -936,7 +955,8 @@ class InfoExtractor(object):
 
     def _family_friendly_search(self, html):
         # See http://schema.org/VideoObject
-        family_friendly = self._html_search_meta('isFamilyFriendly', html)
+        family_friendly = self._html_search_meta(
+            'isFamilyFriendly', html, default=None)
 
         if not family_friendly:
             return None
@@ -990,6 +1010,7 @@ class InfoExtractor(object):
                 'tbr': int_or_none(e.get('bitrate')),
                 'width': int_or_none(e.get('width')),
                 'height': int_or_none(e.get('height')),
+                'view_count': int_or_none(e.get('interactionCount')),
             })
 
         for e in json_ld:
@@ -997,17 +1018,17 @@ class InfoExtractor(object):
                 item_type = e.get('@type')
                 if expected_type is not None and expected_type != item_type:
                     return info
-                if item_type == 'TVEpisode':
+                if item_type in ('TVEpisode', 'Episode'):
                     info.update({
                         'episode': unescapeHTML(e.get('name')),
                         'episode_number': int_or_none(e.get('episodeNumber')),
                         'description': unescapeHTML(e.get('description')),
                     })
                     part_of_season = e.get('partOfSeason')
-                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
+                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
-                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                         info['series'] = unescapeHTML(part_of_series.get('name'))
                 elif item_type == 'Article':
                     info.update({
@@ -1017,10 +1038,10 @@ class InfoExtractor(object):
                     })
                 elif item_type == 'VideoObject':
                     extract_video_object(e)
-                elif item_type == 'WebPage':
-                    video = e.get('video')
-                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
-                        extract_video_object(video)
+                    continue
+                video = e.get('video')
+                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                    extract_video_object(video)
                 break
         return dict((k, v) for k, v in info.items() if v is not None)
 
@@ -1780,7 +1801,7 @@ class InfoExtractor(object):
                     ms_info['timescale'] = int(timescale)
                 segment_duration = source.get('duration')
                 if segment_duration:
-                    ms_info['segment_duration'] = int(segment_duration)
+                    ms_info['segment_duration'] = float(segment_duration)
 
             def extract_Initialization(source):
                 initialization = source.find(_add_ns('Initialization'))
@@ -1887,9 +1908,13 @@ class InfoExtractor(object):
                                 'Bandwidth': bandwidth,
                             }
 
+                        def location_key(location):
+                            return 'url' if re.match(r'^https?://', location) else 'path'
+
                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
 
                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+                            media_location_key = location_key(media_template)
 
                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                             # can't be used at the same time
@@ -1899,7 +1924,7 @@ class InfoExtractor(object):
                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                 representation_ms_info['fragments'] = [{
-                                    'url': media_template % {
+                                    media_location_key: media_template % {
                                         'Number': segment_number,
                                         'Bandwidth': bandwidth,
                                     },
@@ -1923,7 +1948,7 @@ class InfoExtractor(object):
                                         'Number': segment_number,
                                     }
                                     representation_ms_info['fragments'].append({
-                                        'url': segment_url,
+                                        media_location_key: segment_url,
                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                     })
 
@@ -1947,8 +1972,9 @@ class InfoExtractor(object):
                             for s in representation_ms_info['s']:
                                 duration = float_or_none(s['d'], timescale)
                                 for r in range(s.get('r', 0) + 1):
+                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                     fragments.append({
-                                        'url': representation_ms_info['segment_urls'][segment_index],
+                                        location_key(segment_uri): segment_uri,
                                         'duration': duration,
                                     })
                                     segment_index += 1
@@ -1957,6 +1983,7 @@ class InfoExtractor(object):
                         # No fragments key is present in this case.
                         if 'fragments' in representation_ms_info:
                             f.update({
+                                'fragment_base_url': base_url,
                                 'fragments': [],
                                 'protocol': 'http_dash_segments',
                             })
@@ -1964,10 +1991,8 @@ class InfoExtractor(object):
                                 initialization_url = representation_ms_info['initialization_url']
                                 if not f.get('url'):
                                     f['url'] = initialization_url
-                                f['fragments'].append({'url': initialization_url})
+                                f['fragments'].append({location_key(initialization_url): initialization_url})
                             f['fragments'].extend(representation_ms_info['fragments'])
-                            for fragment in f['fragments']:
-                                fragment['url'] = urljoin(base_url, fragment['url'])
                         try:
                             existing_format = next(
                                 fo for fo in formats
@@ -1996,6 +2021,12 @@ class InfoExtractor(object):
             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
 
     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+        """
+        Parse formats from ISM manifest.
+        References:
+         1. [MS-SSTR]: Smooth Streaming Protocol,
+            https://msdn.microsoft.com/en-us/library/ff469518.aspx
+        """
         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
             return []
 
@@ -2017,8 +2048,11 @@ class InfoExtractor(object):
                     self.report_warning('%s is not a supported codec' % fourcc)
                     continue
                 tbr = int(track.attrib['Bitrate']) // 1000
-                width = int_or_none(track.get('MaxWidth'))
-                height = int_or_none(track.get('MaxHeight'))
+                # [1] does not mention Width and Height attributes. However,
+                # they're often present while MaxWidth and MaxHeight are
+                # missing, so should be used as fallbacks
+                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                 sampling_rate = int_or_none(track.get('SamplingRate'))
 
                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2096,19 +2130,19 @@ class InfoExtractor(object):
                 return f
             return {}
 
-        def _media_formats(src, cur_media_type):
+        def _media_formats(src, cur_media_type, type_info={}):
             full_url = absolute_url(src)
-            ext = determine_ext(full_url)
+            ext = type_info.get('ext') or determine_ext(full_url)
             if ext == 'm3u8':
                 is_plain_url = False
                 formats = self._extract_m3u8_formats(
                     full_url, video_id, ext='mp4',
                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
-                    preference=preference)
+                    preference=preference, fatal=False)
             elif ext == 'mpd':
                 is_plain_url = False
                 formats = self._extract_mpd_formats(
-                    full_url, video_id, mpd_id=mpd_id)
+                    full_url, video_id, mpd_id=mpd_id, fatal=False)
             else:
                 is_plain_url = True
                 formats = [{
@@ -2118,15 +2152,18 @@ class InfoExtractor(object):
             return is_plain_url, formats
 
         entries = []
+        # amp-video and amp-audio are very similar to their HTML5 counterparts
+        # so we wll include them right here (see
+        # https://www.ampproject.org/docs/reference/components/amp-video)
         media_tags = [(media_tag, media_type, '')
                       for media_tag, media_type
-                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
         media_tags.extend(re.findall(
             # We only allow video|audio followed by a whitespace or '>'.
             # Allowing more characters may end up in significant slow down (see
             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
             # http://www.porntrex.com/maps/videositemap.xml).
-            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
+            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
         for media_tag, media_type, media_content in media_tags:
             media_info = {
                 'formats': [],
@@ -2144,9 +2181,15 @@ class InfoExtractor(object):
                     src = source_attributes.get('src')
                     if not src:
                         continue
-                    is_plain_url, formats = _media_formats(src, media_type)
+                    f = parse_content_type(source_attributes.get('type'))
+                    is_plain_url, formats = _media_formats(src, media_type, f)
                     if is_plain_url:
-                        f = parse_content_type(source_attributes.get('type'))
+                        # res attribute is not standard but seen several times
+                        # in the wild
+                        f.update({
+                            'height': int_or_none(source_attributes.get('res')),
+                            'format_id': source_attributes.get('label'),
+                        })
                         f.update(formats[0])
                         media_info['formats'].append(f)
                     else:
@@ -2169,7 +2212,7 @@ class InfoExtractor(object):
     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
         formats = []
         hdcore_sign = 'hdcore=3.7.0'
-        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
         hds_host = hosts.get('hds')
         if hds_host:
             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2191,8 +2234,9 @@ class InfoExtractor(object):
 
     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
-        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
-        http_base_url = 'http' + url_base
+        url_base = self._search_regex(
+            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+        http_base_url = '%s:%s' % ('http', url_base)
         formats = []
         if 'm3u8' not in skip_protocols:
             formats.extend(self._extract_m3u8_formats(
@@ -2226,7 +2270,7 @@ class InfoExtractor(object):
             for protocol in ('rtmp', 'rtsp'):
                 if protocol not in skip_protocols:
                     formats.append({
-                        'url': protocol + url_base,
+                        'url': '%s:%s' % (protocol, url_base),
                         'format_id': protocol,
                         'protocol': protocol,
                     })
@@ -2284,6 +2328,8 @@ class InfoExtractor(object):
             tracks = video_data.get('tracks')
             if tracks and isinstance(tracks, list):
                 for track in tracks:
+                    if not isinstance(track, dict):
+                        continue
                     if track.get('kind') != 'captions':
                         continue
                     track_url = urljoin(base_url, track.get('file'))
@@ -2313,6 +2359,8 @@ class InfoExtractor(object):
         urls = []
         formats = []
         for source in jwplayer_sources_data:
+            if not isinstance(source, dict):
+                continue
             source_url = self._proto_relative_url(source.get('file'))
             if not source_url:
                 continue
index d3463b8747a2109e8efef6b93d70843e542b4590..0c3f0c0e4e980d06ba62b4a7b00be3d74c0e5cbe 100644 (file)
@@ -16,7 +16,6 @@ from ..utils import (
     mimetype2ext,
     orderedSet,
     parse_iso8601,
-    remove_end,
 )
 
 
@@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor):
         'wmagazine': 'W Magazine',
     }
 
-    _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+        (?:
+            (?:
+                embed(?:js)?|
+                (?:script|inline)/video
+            )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+        )''' % '|'.join(_SITES.keys())
     IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
 
-    EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
+    EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
 
     _TESTS = [{
         'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
@@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor):
             'upload_date': '20150916',
             'timestamp': 1442434955,
         }
+    }, {
+        'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
+        'only_matching': True,
+    }, {
+        'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
+        'only_matching': True,
     }]
 
     def _extract_series(self, url, webpage):
@@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor):
         entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
         return self.playlist_result(entries, playlist_title=title)
 
-    def _extract_video(self, webpage, url_type):
+    def _extract_video_params(self, webpage):
         query = {}
         params = self._search_regex(
             r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
@@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor):
                 'playerId': params['data-player'],
                 'target': params['id'],
             })
-        video_id = query['videoId']
+        return query
+
+    def _extract_video(self, params):
+        video_id = params['videoId']
+
         video_info = None
-        info_page = self._download_json(
-            'http://player.cnevids.com/player/video.js',
-            video_id, 'Downloading video info', fatal=False, query=query)
-        if info_page:
-            video_info = info_page.get('video')
-        if not video_info:
+        if params.get('playerId'):
+            info_page = self._download_json(
+                'http://player.cnevids.com/player/video.js',
+                video_id, 'Downloading video info', fatal=False, query=params)
+            if info_page:
+                video_info = info_page.get('video')
+            if not video_info:
+                info_page = self._download_webpage(
+                    'http://player.cnevids.com/player/loader.js',
+                    video_id, 'Downloading loader info', query=params)
+        else:
             info_page = self._download_webpage(
-                'http://player.cnevids.com/player/loader.js',
-                video_id, 'Downloading loader info', query=query)
+                'https://player.cnevids.com/inline/video/%s.js' % video_id,
+                video_id, 'Downloading inline info', query={
+                    'target': params.get('target', 'embedplayer')
+                })
+
+        if not video_info:
             video_info = self._parse_json(
                 self._search_regex(
                     r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
@@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor):
             })
         self._sort_formats(formats)
 
-        info = self._search_json_ld(
-            webpage, video_id, fatal=False) if url_type != 'embed' else {}
-        info.update({
+        return {
             'id': video_id,
             'formats': formats,
             'title': title,
@@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor):
             'series': video_info.get('series_title'),
             'season': video_info.get('season_title'),
             'timestamp': parse_iso8601(video_info.get('premiere_date')),
-        })
-        return info
+            'categories': video_info.get('categories'),
+        }
 
     def _real_extract(self, url):
-        site, url_type, item_id = re.match(self._VALID_URL, url).groups()
+        video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
 
-        # Convert JS embed to regular embed
-        if url_type == 'embedjs':
-            parsed_url = compat_urlparse.urlparse(url)
-            url = compat_urlparse.urlunparse(parsed_url._replace(
-                path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
-            url_type = 'embed'
+        if video_id:
+            return self._extract_video({
+                'videoId': video_id,
+                'playerId': player_id,
+                'target': target,
+            })
 
-        webpage = self._download_webpage(url, item_id)
+        webpage = self._download_webpage(url, display_id)
 
         if url_type == 'series':
             return self._extract_series(url, webpage)
         else:
-            return self._extract_video(webpage, url_type)
+            params = self._extract_video_params(webpage)
+            info = self._search_json_ld(
+                webpage, display_id, fatal=False)
+            info.update(self._extract_video(params))
+            return info
index 7b2f5008b5b525842d7ac248b1ca2f8f366a72dd..807a29eeaad9d71ba56384774d23f0e5e3ecc397 100644 (file)
@@ -8,7 +8,16 @@ from ..utils import int_or_none
 
 
 class CorusIE(ThePlatformFeedIE):
-    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:globaltv|etcanada)\.com|(?:hgtv|foodnetwork|slice)\.ca)/(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?P<domain>
+                            (?:globaltv|etcanada)\.com|
+                            (?:hgtv|foodnetwork|slice|history|showcase)\.ca
+                        )
+                        /(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))
+                        (?P<id>\d+)
+                    '''
     _TESTS = [{
         'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/',
         'md5': '05dcbca777bf1e58c2acbb57168ad3a6',
@@ -27,6 +36,12 @@ class CorusIE(ThePlatformFeedIE):
     }, {
         'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/',
         'only_matching': True,
+    }, {
+        'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video',
+        'only_matching': True,
     }]
 
     _TP_FEEDS = {
@@ -50,6 +65,14 @@ class CorusIE(ThePlatformFeedIE):
             'feed_id': '5tUJLgV2YNJ5',
             'account_id': 2414427935,
         },
+        'history': {
+            'feed_id': 'tQFx_TyyEq4J',
+            'account_id': 2369613659,
+        },
+        'showcase': {
+            'feed_id': '9H6qyshBZU3E',
+            'account_id': 2414426607,
+        },
     }
 
     def _real_extract(self, url):
index 5fa1f006b82675d299d1cef30fbe2108496256d5..6ea03e65cbf84d4edba9a74f53ce7e1106bf511a 100644 (file)
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
             'duration': 4.6,
             'timestamp': 1428527772,
             'upload_date': '20150408',
-            'uploader': 'Артём Лоскутников',
+            'uploader': 'Artyom Loskutnikov',
             'uploader_id': 'artyom.loskutnikov',
             'view_count': int,
             'like_count': int,
             'repost_count': int,
-            'comment_count': int,
             'age_limit': 0,
         },
     }, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
         view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
         like_count = int_or_none(coub.get('likes_count'))
         repost_count = int_or_none(coub.get('recoubs_count'))
-        comment_count = int_or_none(coub.get('comments_count'))
 
         age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
         if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
             'view_count': view_count,
             'like_count': like_count,
             'repost_count': repost_count,
-            'comment_count': comment_count,
             'age_limit': age_limit,
             'formats': formats,
         }
index 94d03ce2af108a4a711f09f3db9fecd5bd62566e..f77a68ecedf74bc5192fbc475848f6b1a1234315 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..utils import (
     parse_iso8601,
     str_to_int,
@@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        youtube_url = self._search_regex(
-            r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
-            webpage, 'youtube url', default=None)
+        youtube_url = YoutubeIE._extract_url(webpage)
         if youtube_url:
-            return self.url_result(youtube_url, 'Youtube')
+            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
 
         video_url = self._html_search_regex(
             [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
index f919ed208d16d0cb8e8299ceacec1d96b0237c81..13f425b2bf1672c66a723f22a0e6e3d73c0456c3 100644 (file)
@@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor):
             'season_number': 8,
             'episode_number': 4,
             'subtitles': {
-                'en-US': [{
-                    'ext': 'ttml',
-                }]
+                'en-US': [
+                    {'ext': 'vtt'},
+                    {'ext': 'tt'},
+                ]
             },
         },
         'params': {
index 2ed8b30bbf54b5e2211818dd381432b9c92223e6..8bdaf0c2c5af7aa1a562033487a16867c3304f19 100644 (file)
@@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
         'info_dict': {
             'id': '727589',
             'ext': 'mp4',
-            'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!",
+            'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",
             'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Kadokawa Pictures Inc.',
@@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             'series': "KONOSUBA -God's blessing on this wonderful world!",
             'season': "KONOSUBA -God's blessing on this wonderful world! 2",
             'season_number': 2,
-            'episode': 'Give Me Deliverance from this Judicial Injustice!',
+            'episode': 'Give Me Deliverance From This Judicial Injustice!',
             'episode_number': 1,
         },
         'params': {
@@ -510,7 +510,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
         # webpage provide more accurate data than series_title from XML
         series = self._html_search_regex(
-            r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)',
+            r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
             webpage, 'series', fatal=False)
         season = xpath_text(metadata, 'series_title')
 
@@ -518,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
 
         season_number = int_or_none(self._search_regex(
-            r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)',
+            r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
             webpage, 'season number', default=None))
 
         return {
index d4576160b4489e599e4ca7dabc1e18c9d685610f..171820e2722b1c8eb1f7fdcfb5991b3186c58f01 100644 (file)
@@ -10,6 +10,7 @@ from ..utils import (
     smuggle_url,
     determine_ext,
     ExtractorError,
+    extract_attributes,
 )
 from .senateisvp import SenateISVPIE
 from .ustream import UstreamIE
@@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor):
             'uploader_id': '12987475',
         },
     }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor):
         if ustream_url:
             return self.url_result(ustream_url, UstreamIE.ie_key())
 
+        if '&vod' not in url:
+            bc = self._search_regex(
+                r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
+                webpage, 'brightcove embed', default=None)
+            if bc:
+                bc_attr = extract_attributes(bc)
+                bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
+                    bc_attr.get('data-bcaccountid', '3162030207001'),
+                    bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
+                    bc_attr.get('data-newbcplayerid', 'default'),
+                    bc_attr['data-bcid'])
+                return self.url_result(smuggle_url(bc_url, {'source_url': url}))
+
         # We first look for clipid, because clipprog always appears before
         patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
         results = list(filter(None, (re.search(p, webpage) for p in patterns)))
index 98c835bf12f0508ac75d57c37a64e89f14543183..af39780353ac1e044ffb07464dfa64032ba18f0e 100644 (file)
@@ -1,17 +1,21 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     int_or_none,
     determine_protocol,
+    try_get,
     unescapeHTML,
 )
 
 
 class DailyMailIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
+    _TESTS = [{
         'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
         'md5': 'f6129624562251f628296c3a9ffde124',
         'info_dict': {
@@ -20,7 +24,16 @@ class DailyMailIE(InfoExtractor):
             'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',
             'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',
         }
-    }
+    }, {
+        'url': 'http://www.dailymail.co.uk/embed/video/1295863.html',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
+            webpage)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -28,8 +41,14 @@ class DailyMailIE(InfoExtractor):
         video_data = self._parse_json(self._search_regex(
             r"data-opts='({.+?})'", webpage, 'video data'), video_id)
         title = unescapeHTML(video_data['title'])
-        video_sources = self._download_json(video_data.get(
-            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+        sources_url = (try_get(
+            video_data,
+            (lambda x: x['plugins']['sources']['url'],
+             lambda x: x['sources']['url']), compat_str) or
+            'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
+
+        video_sources = self._download_json(sources_url, video_id)
 
         formats = []
         for rendition in video_sources['renditions']:
index 246efde4342cf5fd4c2b3807c16a118bab5175dc..74e99133132c381e0420ca0cb470753e110f05cd 100644 (file)
@@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
 
 
 class DailymotionIE(DailymotionBaseInfoExtractor):
-    _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
+    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
     IE_NAME = 'dailymotion'
 
     _FORMATS = [
@@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         ('stream_h264_hd1080_url', 'hd180'),
     ]
 
-    _TESTS = [
-        {
-            'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
-            'md5': '2137c41a8e78554bb09225b8eb322406',
-            'info_dict': {
-                'id': 'x2iuewm',
-                'ext': 'mp4',
-                'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
-                'description': 'Several come bundled with the Steam Controller.',
-                'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
-                'duration': 74,
-                'timestamp': 1425657362,
-                'upload_date': '20150306',
-                'uploader': 'IGN',
-                'uploader_id': 'xijv66',
-                'age_limit': 0,
-                'view_count': int,
-            }
+    _TESTS = [{
+        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+        'md5': '074b95bdee76b9e3654137aee9c79dfe',
+        'info_dict': {
+            'id': 'x5kesuj',
+            'ext': 'mp4',
+            'title': 'Office Christmas Party Review –  Jason Bateman, Olivia Munn, T.J. Miller',
+            'description': 'Office Christmas Party Review -  Jason Bateman, Olivia Munn, T.J. Miller',
+            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+            'duration': 187,
+            'timestamp': 1493651285,
+            'upload_date': '20170501',
+            'uploader': 'Deadline',
+            'uploader_id': 'x1xm8ri',
+            'age_limit': 0,
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+        'md5': '2137c41a8e78554bb09225b8eb322406',
+        'info_dict': {
+            'id': 'x2iuewm',
+            'ext': 'mp4',
+            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+            'description': 'Several come bundled with the Steam Controller.',
+            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+            'duration': 74,
+            'timestamp': 1425657362,
+            'upload_date': '20150306',
+            'uploader': 'IGN',
+            'uploader_id': 'xijv66',
+            'age_limit': 0,
+            'view_count': int,
         },
+        'skip': 'video gone',
+    }, {
         # Vevo video
-        {
-            'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
-            'info_dict': {
-                'title': 'Roar (Official)',
-                'id': 'USUV71301934',
-                'ext': 'mp4',
-                'uploader': 'Katy Perry',
-                'upload_date': '20130905',
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'skip': 'VEVO is only available in some countries',
+        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+        'info_dict': {
+            'title': 'Roar (Official)',
+            'id': 'USUV71301934',
+            'ext': 'mp4',
+            'uploader': 'Katy Perry',
+            'upload_date': '20130905',
+        },
+        'params': {
+            'skip_download': True,
         },
+        'skip': 'VEVO is only available in some countries',
+    }, {
         # age-restricted video
-        {
-            'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
-            'md5': '0d667a7b9cebecc3c89ee93099c4159d',
-            'info_dict': {
-                'id': 'xyh2zz',
-                'ext': 'mp4',
-                'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
-                'uploader': 'HotWaves1012',
-                'age_limit': 18,
-            },
-            'skip': 'video gone',
+        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+        'info_dict': {
+            'id': 'xyh2zz',
+            'ext': 'mp4',
+            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+            'uploader': 'HotWaves1012',
+            'age_limit': 18,
         },
+        'skip': 'video gone',
+    }, {
         # geo-restricted, player v5
-        {
-            'url': 'http://www.dailymotion.com/video/xhza0o',
-            'only_matching': True,
-        },
+        'url': 'http://www.dailymotion.com/video/xhza0o',
+        'only_matching': True,
+    }, {
         # with subtitles
-        {
-            'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
-            'only_matching': True,
-        },
-        {
-            'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
-            'only_matching': True,
-        }
-    ]
+        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
+        'only_matching': True,
+    }]
 
     @staticmethod
     def _extract_urls(webpage):
@@ -133,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         view_count_str = self._search_regex(
             (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
              r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
-            webpage, 'view count', fatal=False)
+            webpage, 'view count', default=None)
         if view_count_str:
             view_count_str = re.sub(r'\s', '', view_count_str)
         view_count = str_to_int(view_count_str)
@@ -145,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
              r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
              r'buildPlayer\(({.+?})\);',
-             r'var\s+config\s*=\s*({.+?});'],
+             r'var\s+config\s*=\s*({.+?});',
+             # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
+             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
             webpage, 'player v5', default=None)
         if player_v5:
             player = self._parse_json(player_v5, video_id)
index bdfe638b4d7bd6fa39a4d22a0718033eb130cfa7..5c9c0ecdc0e20c1dbf9871edb65c43a24c714396 100644 (file)
@@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):
         'info_dict': {
             'id': '2015-0703-001',
             'ext': 'mp4',
-            'title': 'Daily Show',
+            'title': 'Daily Show for July 03, 2015',
+            'description': 'md5:80eb927244d6749900de6072c7cc2c86',
         },
     }, {
         'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
index 939d1338c2ec53933b23997110845e99c95217f8..968c4c7fd5216f65741ee21a8052452fd3993d40 100644 (file)
@@ -15,7 +15,7 @@ from ..utils import (
 
 class DisneyIE(InfoExtractor):
     _VALID_URL = r'''(?x)
-        https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))'''
+        https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))'''
     _TESTS = [{
         # Disney.EmbedVideo
         'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977',
@@ -68,6 +68,9 @@ class DisneyIE(InfoExtractor):
     }, {
         'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo',
         'only_matching': True,
+    }, {
+        'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268',
+        'only_matching': True,
     }, {
         'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue',
         'only_matching': True,
index a78cb8a2a57c4a851798e94d6f88ee56bd9df68c..c05f601e2b1a69dcf4d73e090638ec990c6c7331 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class DigitallySpeakingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+    _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
 
     _TESTS = [{
         # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
         # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
         'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
         'only_matching': True,
+    }, {
+        # From http://www.gdcvault.com/play/1013700/Advanced-Material
+        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+        'only_matching': True,
     }]
 
     def _parse_mp4(self, metadata):
index 1f75352ca945c3e63ddf85e1ec204b5787cafeb6..148605c0bd0275fc18ad8fa2ace97ce8aa3cb7c7 100644 (file)
@@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):
             'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
             'duration': 290,
             'timestamp': 1476767794.2809999,
-            'upload_date': '20160525',
+            'upload_date': '20161018',
             'uploader': 'parthivi001',
             'uploader_id': 'user52596202',
             'view_count': int,
index 82d8a042f58e738422a4c64b7fa90f8ac127f5fb..9757f442259d5678cf3bd6ebeb1dbec28eb03184 100644 (file)
@@ -3,11 +3,14 @@ from __future__ import unicode_literals
 
 import time
 import hashlib
+import re
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     unescapeHTML,
+    unified_strdate,
+    urljoin,
 )
 
 
@@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor):
             'id': '17732',
             'display_id': 'iseven',
             'ext': 'flv',
-            'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'description': r're:.*m7show@163\.com.*',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': '7师傅',
@@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor):
             'id': '17732',
             'display_id': '17732',
             'ext': 'flv',
-            'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
             'description': r're:.*m7show@163\.com.*',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': '7师傅',
@@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor):
             'uploader': uploader,
             'is_live': True,
         }
+
+
+class DouyuShowIE(InfoExtractor):
+    _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+    _TESTS = [{
+        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'md5': '0c2cfd068ee2afe657801269b2d86214',
+        'info_dict': {
+            'id': 'rjNBdvnVXNzvE2yw',
+            'ext': 'mp4',
+            'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+            'duration': 7150.08,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '陈一发儿',
+            'uploader_id': 'XrZwYelr5wbK',
+            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+            'upload_date': '20170402',
+        },
+    }, {
+        'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url = url.replace('vmobile.', 'v.')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        room_info = self._parse_json(self._search_regex(
+            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+        video_info = None
+
+        for trial in range(5):
+            # Sometimes Douyu rejects our request. Let's try it more times
+            try:
+                video_info = self._download_json(
+                    'https://vmobile.douyu.com/video/getInfo', video_id,
+                    query={'vid': video_id},
+                    headers={
+                        'Referer': url,
+                        'x-requested-with': 'XMLHttpRequest',
+                    })
+                break
+            except ExtractorError:
+                self._sleep(1, video_id)
+
+        if not video_info:
+            raise ExtractorError('Can\'t fetch video info')
+
+        formats = self._extract_m3u8_formats(
+            video_info['data']['video_url'], video_id,
+            entry_protocol='m3u8_native', ext='mp4')
+
+        upload_date = unified_strdate(self._html_search_regex(
+            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+            'upload date', fatal=False))
+
+        uploader = uploader_id = uploader_url = None
+        mobj = re.search(
+            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+            webpage)
+        if mobj:
+            uploader_id, uploader = mobj.groups()
+            uploader_url = urljoin(url, '/author/' + uploader_id)
+
+        return {
+            'id': video_id,
+            'title': room_info['name'],
+            'formats': formats,
+            'duration': room_info.get('duration'),
+            'thumbnail': room_info.get('pic'),
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+        }
index 87c5dd63ec271a6ef8770c2233da295721f863ca..76e784105451293705f0057dbfe5035d73bb5c1d 100644 (file)
@@ -7,16 +7,18 @@ import time
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urlparse,
     compat_HTTPError,
+    compat_str,
+    compat_urlparse,
 )
 from ..utils import (
-    USER_AGENTS,
     ExtractorError,
     int_or_none,
-    unified_strdate,
     remove_end,
+    try_get,
+    unified_strdate,
     update_url_query,
+    USER_AGENTS,
 )
 
 
@@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        info_url = self._search_regex(
-            r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
-            webpage, 'video id')
-
         title = remove_end(self._og_search_title(webpage), ' | Dplay')
 
-        try:
-            info = self._download_json(
-                info_url, display_id, headers={
-                    'Authorization': 'Bearer %s' % self._get_cookies(url).get(
-                        'dplayit_token').value,
-                    'Referer': url,
-                })
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
-                info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
-                error = info['errors'][0]
-                if error.get('code') == 'access.denied.geoblocked':
-                    self.raise_geo_restricted(
-                        msg=error.get('detail'), countries=self._GEO_COUNTRIES)
-                raise ExtractorError(info['errors'][0]['detail'], expected=True)
-            raise
+        video_id = None
+
+        info = self._search_regex(
+            r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")',
+            webpage, 'playback JSON', default=None)
+        if info:
+            for _ in range(2):
+                info = self._parse_json(info, display_id, fatal=False)
+                if not info:
+                    break
+            else:
+                video_id = try_get(info, lambda x: x['data']['id'])
+
+        if not info:
+            info_url = self._search_regex(
+                r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
+                webpage, 'info url')
+
+            video_id = info_url.rpartition('/')[-1]
+
+            try:
+                info = self._download_json(
+                    info_url, display_id, headers={
+                        'Authorization': 'Bearer %s' % self._get_cookies(url).get(
+                            'dplayit_token').value,
+                        'Referer': url,
+                    })
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
+                    info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+                    error = info['errors'][0]
+                    if error.get('code') == 'access.denied.geoblocked':
+                        self.raise_geo_restricted(
+                            msg=error.get('detail'), countries=self._GEO_COUNTRIES)
+                    raise ExtractorError(info['errors'][0]['detail'], expected=True)
+                raise
 
         hls_url = info['data']['attributes']['streaming']['hls']['url']
 
@@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor):
             season_number = episode_number = upload_date = None
 
         return {
-            'id': info_url.rpartition('/')[-1],
+            'id': compat_str(video_id or display_id),
             'display_id': display_id,
             'title': title,
             'description': self._og_search_description(webpage),
index e7abc888988e9807ee46abc4d4e44f8276b9a084..9a498d72ad6f378ef5dd877354871bcccbfb4faf 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     ExtractorError,
     clean_html,
     int_or_none,
+    remove_end,
     sanitized_Request,
     urlencode_postdata
 )
@@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE):
         'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
         'info_dict': {
             'id': '4512.1',
-            'ext': 'mp4',
-            'title': 'Cooking with Shin 4512.1',
+            'ext': 'flv',
+            'title': 'Cooking with Shin',
             'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
             'episode': 'Episode 1',
             'episode_number': 1,
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1404336058,
             'upload_date': '20140702',
-            'duration': 343,
+            'duration': 344,
         },
         'params': {
             # m3u8 download
@@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE):
         'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
         'info_dict': {
             'id': '4826.4',
-            'ext': 'mp4',
-            'title': 'Mnet Asian Music Awards 2015 4826.4',
+            'ext': 'flv',
+            'title': 'Mnet Asian Music Awards 2015',
             'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
             'episode': 'Mnet Asian Music Awards 2015 - Part 3',
             'episode_number': 4,
             'thumbnail': r're:^https?://.*\.jpg',
             'timestamp': 1450213200,
             'upload_date': '20151215',
-            'duration': 5602,
+            'duration': 5359,
         },
         'params': {
             # m3u8 download
@@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE):
                     countries=self._GEO_COUNTRIES)
             raise
 
+        # title is postfixed with video id for some reason, removing
+        if info.get('title'):
+            info['title'] = remove_end(info['title'], video_id).strip()
+
         series_id, episode_number = video_id.split('.')
         episode_info = self._download_json(
             # We only need a single episode info, so restricting page size to one episode
index 79ec212c890471bd72a0e88eaf7bec0da70af124..164e97c36b8cab223ac813abaa1f032eb7d4444a 100644 (file)
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
-    parse_iso8601,
+    js_to_json,
+    parse_duration,
+    unescapeHTML,
 )
 
 
 class DRBonanzaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]|$)'
-
-    _TESTS = [{
-        'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
+    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-',
         'info_dict': {
-            'id': '65517',
+            'id': '40312',
+            'display_id': 'matador---0824-komme-fremmede-',
             'ext': 'mp4',
-            'title': 'Talkshowet - Leonard Cohen',
-            'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca',
-            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
-            'timestamp': 1295537932,
-            'upload_date': '20110120',
-            'duration': 3664,
-        },
-        'params': {
-            'skip_download': True,  # requires rtmp
-        },
-    }, {
-        'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
-        'md5': '6dfe039417e76795fb783c52da3de11d',
-        'info_dict': {
-            'id': '59410',
-            'ext': 'mp3',
-            'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission',
-            'description': 'md5:501e5a195749480552e214fbbed16c4e',
+            'title': 'MATADOR - 08:24. "Komme fremmede".',
+            'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84',
             'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
-            'timestamp': 1223274900,
-            'upload_date': '20081006',
-            'duration': 7369,
+            'duration': 4613,
         },
-    }]
+    }
 
     def _real_extract(self, url):
-        url_id = self._match_id(url)
-        webpage = self._download_webpage(url, url_id)
-
-        if url_id:
-            info = json.loads(self._html_search_regex(r'({.*?%s.*})' % url_id, webpage, 'json'))
-        else:
-            # Just fetch the first video on that page
-            info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json'))
-
-        asset_id = str(info['AssetId'])
-        title = info['Title'].rstrip(' \'\"-,.:;!?')
-        duration = int_or_none(info.get('Duration'), scale=1000)
-        # First published online. "FirstPublished" contains the date for original airing.
-        timestamp = parse_iso8601(
-            re.sub(r'\.\d+$', '', info['Created']))
-
-        def parse_filename_info(url):
-            match = re.search(r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
-            if match:
-                return {
-                    'width': int(match.group('width')),
-                    'height': int(match.group('height')),
-                    'vbr': int(match.group('bitrate')),
-                    'ext': match.group('ext')
-                }
-            match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
-            if match:
-                return {
-                    'vbr': int(match.group('bitrate')),
-                    'ext': match.group(2)
-                }
-            return {}
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
 
-        video_types = ['VideoHigh', 'VideoMid', 'VideoLow']
-        preferencemap = {
-            'VideoHigh': -1,
-            'VideoMid': -2,
-            'VideoLow': -3,
-            'Audio': -4,
-        }
+        webpage = self._download_webpage(url, display_id)
 
-        formats = []
-        for file in info['Files']:
-            if info['Type'] == 'Video':
-                if file['Type'] in video_types:
-                    format = parse_filename_info(file['Location'])
-                    format.update({
-                        'url': file['Location'],
-                        'format_id': file['Type'].replace('Video', ''),
-                        'preference': preferencemap.get(file['Type'], -10),
-                    })
-                    if format['url'].startswith('rtmp'):
-                        rtmp_url = format['url']
-                        format['rtmp_live'] = True  # --resume does not work
-                        if '/bonanza/' in rtmp_url:
-                            format['play_path'] = rtmp_url.split('/bonanza/')[1]
-                    formats.append(format)
-                elif file['Type'] == 'Thumb':
-                    thumbnail = file['Location']
-            elif info['Type'] == 'Audio':
-                if file['Type'] == 'Audio':
-                    format = parse_filename_info(file['Location'])
-                    format.update({
-                        'url': file['Location'],
-                        'format_id': file['Type'],
-                        'vcodec': 'none',
-                    })
-                    formats.append(format)
-                elif file['Type'] == 'Thumb':
-                    thumbnail = file['Location']
+        info = self._parse_html5_media_entries(
+            url, webpage, display_id, m3u8_id='hls',
+            m3u8_entry_protocol='m3u8_native')[0]
+        self._sort_formats(info['formats'])
 
-        description = '%s\n%s\n%s\n' % (
-            info['Description'], info['Actors'], info['Colophon'])
+        asset = self._parse_json(
+            self._search_regex(
+                r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'),
+            display_id, transform_source=js_to_json)
 
-        self._sort_formats(formats)
+        title = unescapeHTML(asset['AssetTitle']).strip()
 
-        display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
-        display_id = re.sub(r'-+', '-', display_id)
+        def extract(field):
+            return self._search_regex(
+                r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field,
+                webpage, field, default=None)
 
-        return {
-            'id': asset_id,
+        info.update({
+            'id': asset.get('AssetId') or video_id,
             'display_id': display_id,
             'title': title,
-            'formats': formats,
-            'description': description,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
-        }
+            'description': extract('Programinfo'),
+            'duration': parse_duration(extract('Tid')),
+            'thumbnail': asset.get('AssetImageUrl'),
+        })
+        return info
index 1eca82b3b46ae47e511b0f2f3f8bd6bb505cdc23..c5d56a9adf9c4ff8ad338d326d5548fb5bdf50e1 100644 (file)
@@ -44,8 +44,23 @@ class DrTuberIE(InfoExtractor):
         webpage = self._download_webpage(
             'http://www.drtuber.com/video/%s' % video_id, display_id)
 
-        video_url = self._html_search_regex(
-            r'<source src="([^"]+)"', webpage, 'video URL')
+        video_data = self._download_json(
+            'http://www.drtuber.com/player_config_json/', video_id, query={
+                'vid': video_id,
+                'embed': 0,
+                'aid': 0,
+                'domain_id': 0,
+            })
+
+        formats = []
+        for format_id, video_url in video_data['files'].items():
+            if video_url:
+                formats.append({
+                    'format_id': format_id,
+                    'quality': 2 if format_id == 'hq' else 1,
+                    'url': video_url
+                })
+        self._sort_formats(formats)
 
         title = self._html_search_regex(
             (r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<',
@@ -75,7 +90,7 @@ class DrTuberIE(InfoExtractor):
         return {
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
+            'formats': formats,
             'title': title,
             'thumbnail': thumbnail,
             'like_count': like_count,
index e4917014adae2e3fe286acc2a2cb66b6ddc3c735..69effba58371426ec6813066424e86aa976b9ce8 100644 (file)
@@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor):
     IE_NAME = 'drtv'
     _TESTS = [{
         'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
-        'md5': '25e659cccc9a2ed956110a299fdf5983',
+        'md5': '7ae17b4e18eb5d29212f424a7511c184',
         'info_dict': {
             'id': 'klassen-darlig-taber-10',
             'ext': 'mp4',
@@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor):
             'upload_date': '20160823',
             'duration': 606.84,
         },
-        'params': {
-            'skip_download': True,
-        },
     }, {
+        # embed
         'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
-        'md5': '2c37175c718155930f939ef59952474a',
         'info_dict': {
             'id': 'christiania-pusher-street-ryddes-drdkrjpo',
             'ext': 'mp4',
             'title': 'LIVE Christianias rydning af Pusher Street er i gang',
-            'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.',
+            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
             'timestamp': 1472800279,
             'upload_date': '20160902',
             'duration': 131.4,
         },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # with SignLanguage formats
+        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
+        'info_dict': {
+            'id': 'historien-om-danmark-stenalder',
+            'ext': 'mp4',
+            'title': 'Historien om Danmark: Stenalder (1)',
+            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
+            'timestamp': 1490401996,
+            'upload_date': '20170325',
+            'duration': 3502.04,
+            'formats': 'mincount:20',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor):
             elif kind in ('VideoResource', 'AudioResource'):
                 duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
                 restricted_to_denmark = asset.get('RestrictedToDenmark')
-                spoken_subtitles = asset.get('Target') == 'SpokenSubtitles'
+                asset_target = asset.get('Target')
                 for link in asset.get('Links', []):
                     uri = link.get('Uri')
                     if not uri:
@@ -96,13 +112,13 @@ class DRTVIE(InfoExtractor):
                     target = link.get('Target')
                     format_id = target or ''
                     preference = None
-                    if spoken_subtitles:
+                    if asset_target in ('SpokenSubtitles', 'SignLanguage'):
                         preference = -1
-                        format_id += '-spoken-subtitles'
+                        format_id += '-%s' % asset_target
                     if target == 'HDS':
                         f4m_formats = self._extract_f4m_formats(
                             uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
-                            video_id, preference, f4m_id=format_id)
+                            video_id, preference, f4m_id=format_id, fatal=False)
                         if kind == 'AudioResource':
                             for f in f4m_formats:
                                 f['vcodec'] = 'none'
@@ -110,7 +126,8 @@ class DRTVIE(InfoExtractor):
                     elif target == 'HLS':
                         formats.extend(self._extract_m3u8_formats(
                             uri, video_id, 'mp4', entry_protocol='m3u8_native',
-                            preference=preference, m3u8_id=format_id))
+                            preference=preference, m3u8_id=format_id,
+                            fatal=False))
                     else:
                         bitrate = link.get('Bitrate')
                         if bitrate:
index 974c69dbc75fcb29bd57e30432fa466182b68743..e85c58bd5669bd14cb069690d43277f77cea49d3 100644 (file)
@@ -5,9 +5,12 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
     js_to_json,
+    mimetype2ext,
     unescapeHTML,
-    ExtractorError,
 )
 
 
@@ -24,14 +27,7 @@ class DVTVIE(InfoExtractor):
             'id': 'dc0768de855511e49e4b0025900fea04',
             'ext': 'mp4',
             'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',
-        }
-    }, {
-        'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/',
-        'md5': '6388f1941b48537dbd28791f712af8bf',
-        'info_dict': {
-            'id': '72c02230849211e49f60002590604f2e',
-            'ext': 'mp4',
-            'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala',
+            'duration': 1484,
         }
     }, {
         'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',
@@ -44,55 +40,100 @@ class DVTVIE(InfoExtractor):
             'info_dict': {
                 'id': 'b0b40906854d11e4bdad0025900fea04',
                 'ext': 'mp4',
-                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne'
+                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne',
+                'description': 'md5:0916925dea8e30fe84222582280b47a0',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
             }
         }, {
             'md5': '5f7652a08b05009c1292317b449ffea2',
             'info_dict': {
                 'id': '420ad9ec854a11e4bdad0025900fea04',
                 'ext': 'mp4',
-                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka'
+                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka',
+                'description': 'md5:ff2f9f6de73c73d7cef4f756c1c1af42',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
             }
         }, {
             'md5': '498eb9dfa97169f409126c617e2a3d64',
             'info_dict': {
                 'id': '95d35580846a11e4b6d20025900fea04',
                 'ext': 'mp4',
-                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?'
+                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?',
+                'description': 'md5:889fe610a70fee5511dc3326a089188e',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
             }
         }, {
             'md5': 'b8dc6b744844032dab6ba3781a7274b9',
             'info_dict': {
                 'id': '6fe14d66853511e4833a0025900fea04',
                 'ext': 'mp4',
-                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády'
+                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády',
+                'description': 'md5:544f86de6d20c4815bea11bf2ac3004f',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
             }
         }],
+    }, {
+        'url': 'https://video.aktualne.cz/dvtv/zeman-si-jen-leci-mindraky-sobotku-nenavidi-a-babis-se-mu-te/r~960cdb3a365a11e7a83b0025900fea04/',
+        'md5': 'f8efe9656017da948369aa099788c8ea',
+        'info_dict': {
+            'id': '3c496fec365911e7a6500025900fea04',
+            'ext': 'mp4',
+            'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta',
+            'duration': 1103,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',
         'only_matching': True,
     }]
 
     def _parse_video_metadata(self, js, video_id):
-        metadata = self._parse_json(js, video_id, transform_source=js_to_json)
+        data = self._parse_json(js, video_id, transform_source=js_to_json)
 
-        formats = []
-        for video in metadata['sources']:
-            ext = video['type'][6:]
-            formats.append({
-                'url': video['file'],
-                'ext': ext,
-                'format_id': '%s-%s' % (ext, video['label']),
-                'height': int(video['label'].rstrip('p')),
-                'fps': 25,
-            })
+        title = unescapeHTML(data['title'])
 
+        formats = []
+        for video in data['sources']:
+            video_url = video.get('file')
+            if not video_url:
+                continue
+            video_type = video.get('type')
+            ext = determine_ext(video_url, mimetype2ext(video_type))
+            if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif video_type == 'application/dash+xml' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    video_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                label = video.get('label')
+                height = self._search_regex(
+                    r'^(\d+)[pP]', label or '', 'height', default=None)
+                format_id = ['http']
+                for f in (ext, label):
+                    if f:
+                        format_id.append(f)
+                formats.append({
+                    'url': video_url,
+                    'format_id': '-'.join(format_id),
+                    'height': int_or_none(height),
+                })
         self._sort_formats(formats)
 
         return {
-            'id': metadata['mediaid'],
-            'title': unescapeHTML(metadata['title']),
-            'thumbnail': self._proto_relative_url(metadata['image'], 'http:'),
+            'id': data.get('mediaid') or video_id,
+            'title': title,
+            'description': data.get('description'),
+            'thumbnail': data.get('image'),
+            'duration': int_or_none(data.get('duration')),
+            'timestamp': int_or_none(data.get('pubtime')),
             'formats': formats
         }
 
@@ -103,7 +144,7 @@ class DVTVIE(InfoExtractor):
 
         # single video
         item = self._search_regex(
-            r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});",
+            r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});',
             webpage, 'video', default=None, fatal=False)
 
         if item:
@@ -113,6 +154,8 @@ class DVTVIE(InfoExtractor):
         items = re.findall(
             r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);",
             webpage)
+        if not items:
+            items = re.findall(r'(?s)var\s+asset\s*=\s*({.+?});\n', webpage)
 
         if items:
             return {
index 76d39adac5faa9912f42d271def60a46128be3f7..42789278e3a04753bcbce06ad62316c458708dc1 100644 (file)
@@ -11,6 +11,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     int_or_none,
+    unsmuggle_url,
 )
 
 
@@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor):
             'view_count': int,
         },
         'skip': 'Georestricted',
+    }, {
+        # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)
+        'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor):
             webpage)
         if mobj is not None:
             return mobj.group('url')
-        # Basic usage embedding (see http://dultonmedia.github.io/eplayer/)
+        PLAYER_JS_RE = r'''
+                        <script[^>]+
+                            src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
+                        .+?
+                    '''
+        # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)
         mobj = re.search(
             r'''(?xs)
-                    <script[^>]+
-                        src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1)
-                    .+?
+                    %s
                     <div[^>]+
-                        class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+
+                        class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+
                         data-id=["\'](?P<id>\d+)
-            ''', webpage)
+            ''' % PLAYER_JS_RE, webpage)
+        if mobj is not None:
+            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+        # Generalization of "Javascript code usage", "Combined usage" and
+        # "Usage without attaching to DOM" embeddings (see
+        # http://dultonmedia.github.io/eplayer/)
+        mobj = re.search(
+            r'''(?xs)
+                    %s
+                    <script>
+                    .+?
+                    new\s+EaglePlayer\(
+                        (?:[^,]+\s*,\s*)?
+                        {
+                            .+?
+                            \bid\s*:\s*["\']?(?P<id>\d+)
+                            .+?
+                        }
+                    \s*\)
+                    .+?
+                    </script>
+            ''' % PLAYER_JS_RE, webpage)
         if mobj is not None:
             return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
 
@@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor):
         if status != 200:
             raise ExtractorError(' '.join(response['errors']), expected=True)
 
-    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs):
+    def _download_json(self, url_or_request, video_id, *args, **kwargs):
         try:
-            response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note)
+            response = super(EaglePlatformIE, self)._download_json(
+                url_or_request, video_id, *args, **kwargs)
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError):
                 response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
@@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor):
         return self._download_json(url_or_request, video_id, note)['data'][0]
 
     def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
         mobj = re.match(self._VALID_URL, url)
         host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
 
+        headers = {}
+        query = {
+            'id': video_id,
+        }
+
+        referrer = smuggled_data.get('referrer')
+        if referrer:
+            headers['Referer'] = referrer
+            query['referrer'] = referrer
+
         player_data = self._download_json(
-            'http://%s/api/player_data?id=%s' % (host, video_id), video_id)
+            'http://%s/api/player_data' % host, video_id,
+            headers=headers, query=query)
 
         media = player_data['data']['playlist']['viewports'][0]['medialist'][0]
 
index db921465e1c6b14330a7e22be79c7a93c6e6ea7b..e4a3046af573fec2961ae84a2137a9395bf68f0b 100644 (file)
@@ -1,15 +1,18 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
 
 
 class EggheadCourseIE(InfoExtractor):
     IE_DESC = 'egghead.io course'
     IE_NAME = 'egghead:course'
-    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)'
+    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
         'playlist_count': 29,
@@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor):
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
 
-        title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title')
-        ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list')
+        course = self._download_json(
+            'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id)
+
+        entries = [
+            self.url_result(
+                'wistia:%s' % lesson['wistia_id'], ie='Wistia',
+                video_id=lesson['wistia_id'], video_title=lesson.get('title'))
+            for lesson in course['lessons'] if lesson.get('wistia_id')]
+
+        return self.playlist_result(
+            entries, playlist_id, course.get('title'),
+            course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+    IE_DESC = 'egghead.io lesson'
+    IE_NAME = 'egghead:lesson'
+    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'info_dict': {
+            'id': 'fv5yotjxcg',
+            'ext': 'mp4',
+            'title': 'Create linear data flow with container style types (Box)',
+            'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'timestamp': 1481296768,
+            'upload_date': '20161209',
+            'duration': 304,
+            'view_count': 0,
+            'tags': ['javascript', 'free'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        lesson_id = self._match_id(url)
 
-        found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul)
-        entries = [self.url_result(m) for m in found]
+        lesson = self._download_json(
+            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
 
         return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'title': title,
-            'description': self._og_search_description(webpage),
-            'entries': entries,
+            '_type': 'url_transparent',
+            'ie_key': 'Wistia',
+            'url': 'wistia:%s' % lesson['wistia_id'],
+            'id': lesson['wistia_id'],
+            'title': lesson.get('title'),
+            'description': lesson.get('summary'),
+            'thumbnail': lesson.get('thumb_nail'),
+            'timestamp': unified_timestamp(lesson.get('published_at')),
+            'duration': int_or_none(lesson.get('duration')),
+            'view_count': int_or_none(lesson.get('plays_count')),
+            'tags': try_get(lesson, lambda x: x['tag_list'], list),
         }
index 8795e0ddf5e26f676a421173a0e1fd019cd112cb..7a743606836517736e858411e626a5e2a502890e 100644 (file)
@@ -10,7 +10,25 @@ from ..utils import (
 
 
 class ESPNIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:\w+\.)+)?espn\.go|
+                            (?:www\.)?espn
+                        )\.com/
+                        (?:
+                            (?:
+                                video/clip|
+                                watch/player
+                            )
+                            (?:
+                                \?.*?\bid=|
+                                /_/id/
+                            )
+                        )
+                        (?P<id>\d+)
+                    '''
+
     _TESTS = [{
         'url': 'http://espn.go.com/video/clip?id=10365079',
         'info_dict': {
@@ -25,20 +43,34 @@ class ESPNIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
-        'url': 'http://espn.go.com/video/clip?id=2743663',
+        'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
         'info_dict': {
-            'id': '2743663',
+            'id': '18910086',
             'ext': 'mp4',
-            'title': 'Must-See Moments: Best of the MLS season',
-            'description': 'md5:4c2d7232beaea572632bec41004f0aeb',
-            'timestamp': 1449446454,
-            'upload_date': '20151207',
+            'title': 'Kyrie spins around defender for two',
+            'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
+            'timestamp': 1489539155,
+            'upload_date': '20170315',
         },
         'params': {
             'skip_download': True,
         },
         'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
+        'only_matching': True,
+    }, {
+        'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?id=19141491',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player/_/id/19141491',
+        'only_matching': True,
     }, {
         'url': 'http://www.espn.com/video/clip?id=10365079',
         'only_matching': True,
index 355a4e56fb94f39f36bd497bb07682ddafa0ef48..aefadc56f6aeee06150c8c27926a7e8aa3e734bd 100644 (file)
@@ -41,9 +41,11 @@ from .alphaporno import AlphaPornoIE
 from .amcnetworks import AMCNetworksIE
 from .animeondemand import AnimeOnDemandIE
 from .anitube import AnitubeIE
+from .anvato import AnvatoIE
 from .anysex import AnySexIE
 from .aol import AolIE
 from .allocine import AllocineIE
+from .aliexpress import AliExpressLiveIE
 from .aparat import AparatIE
 from .appleconnect import AppleConnectIE
 from .appletrailers import (
@@ -70,6 +72,10 @@ from .arte import (
     TheOperaPlatformIE,
     ArteTVPlaylistIE,
 )
+from .asiancrush import (
+    AsianCrushIE,
+    AsianCrushPlaylistIE,
+)
 from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
 from .atvat import ATVAtIE
@@ -89,7 +95,7 @@ from .azmedien import (
 )
 from .baidu import BaiduVideoIE
 from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE, BandcampAlbumIE
+from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
 from .bbc import (
     BBCCoUkIE,
     BBCCoUkArticleIE,
@@ -97,7 +103,10 @@ from .bbc import (
     BBCCoUkPlaylistIE,
     BBCIE,
 )
-from .beampro import BeamProLiveIE
+from .beampro import (
+    BeamProLiveIE,
+    BeamProVodIE,
+)
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
 from .bellmedia import BellMediaIE
@@ -177,8 +186,9 @@ from .chirbit import (
     ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
-from .clipfish import ClipfishIE
+from .cjsw import CJSWIE
 from .cliphunter import CliphunterIE
+from .clippit import ClippitIE
 from .cliprs import ClipRsIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
@@ -250,7 +260,10 @@ from .democracynow import DemocracynowIE
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
+from .douyutv import (
+    DouyuShowIE,
+    DouyuTVIE,
+)
 from .dplay import (
     DPlayIE,
     DPlayItIE,
@@ -286,7 +299,10 @@ from .dw import (
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+    EggheadCourseIE,
+    EggheadLessonIE,
+)
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
@@ -336,7 +352,12 @@ from .flipagram import FlipagramIE
 from .folketinget import FolketingetIE
 from .footyroom import FootyRoomIE
 from .formula1 import Formula1IE
-from .fourtube import FourTubeIE
+from .fourtube import (
+    FourTubeIE,
+    PornTubeIE,
+    PornerBrosIE,
+    FuxIE,
+)
 from .fox import FOXIE
 from .fox9 import FOX9IE
 from .foxgay import FoxgayIE
@@ -349,9 +370,9 @@ from .foxsports import FoxSportsIE
 from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
-    PluzzIE,
-    FranceTvInfoIE,
     FranceTVIE,
+    FranceTVEmbedIE,
+    FranceTVInfoIE,
     GenerationQuoiIE,
     CultureboxIE,
 )
@@ -385,7 +406,6 @@ from .globo import (
 from .go import GoIE
 from .go90 import Go90IE
 from .godtube import GodTubeIE
-from .godtv import GodTVIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
 from .googleplus import GooglePlusIE
@@ -459,6 +479,7 @@ from .jamendo import (
 )
 from .jeuxvideo import JeuxVideoIE
 from .jove import JoveIE
+from .joj import JojIE
 from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kaltura import KalturaIE
@@ -489,6 +510,7 @@ from .la7 import LA7IE
 from .laola1tv import (
     Laola1TvEmbedIE,
     Laola1TvIE,
+    ITTFIE,
 )
 from .lci import LCIIE
 from .lcp import (
@@ -516,7 +538,10 @@ from .limelight import (
     LimelightChannelListIE,
 )
 from .litv import LiTVIE
-from .liveleak import LiveLeakIE
+from .liveleak import (
+    LiveLeakIE,
+    LiveLeakEmbedIE,
+)
 from .livestream import (
     LivestreamIE,
     LivestreamOriginalIE,
@@ -539,9 +564,12 @@ from .mangomolo import (
     MangomoloVideoIE,
     MangomoloLiveIE,
 )
+from .manyvids import ManyVidsIE
 from .matchtv import MatchTVIE
 from .mdr import MDRIE
+from .mediaset import MediasetIE
 from .medici import MediciIE
+from .megaphone import MegaphoneIE
 from .meipai import MeipaiIE
 from .melonvod import MelonVODIE
 from .meta import METAIE
@@ -568,7 +596,6 @@ from .mixcloud import (
 )
 from .mlb import MLBIE
 from .mnet import MnetIE
-from .mpora import MporaIE
 from .moevideo import MoeVideoIE
 from .mofosex import MofosexIE
 from .mojvideo import MojvideoIE
@@ -629,7 +656,10 @@ from .neteasemusic import (
     NetEaseMusicProgramIE,
     NetEaseMusicDjRadioIE,
 )
-from .newgrounds import NewgroundsIE
+from .newgrounds import (
+    NewgroundsIE,
+    NewgroundsPlaylistIE,
+)
 from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
@@ -637,6 +667,10 @@ from .nextmedia import (
     AppleDailyIE,
     NextTVIE,
 )
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
 from .nfb import NFBIE
 from .nfl import NFLIE
 from .nhk import NhkVodIE
@@ -650,6 +684,7 @@ from .nick import (
     NickIE,
     NickDeIE,
     NickNightIE,
+    NickRuIE,
 )
 from .niconico import NiconicoIE, NiconicoPlaylistIE
 from .ninecninemedia import (
@@ -662,6 +697,8 @@ from .nintendo import NintendoIE
 from .njpwworld import NJPWWorldIE
 from .nobelprize import NobelPrizeIE
 from .noco import NocoIE
+from .nonktube import NonkTubeIE
+from .noovo import NoovoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
 from .nova import NovaIE
@@ -730,8 +767,8 @@ from .openload import OpenloadIE
 from .ora import OraTVIE
 from .orf import (
     ORFTVthekIE,
-    ORFOE1IE,
     ORFFM4IE,
+    ORFOE1IE,
     ORFIPTVIE,
 )
 from .packtpub import (
@@ -743,6 +780,7 @@ from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
+from .pearvideo import PearVideoIE
 from .people import PeopleIE
 from .periscope import (
     PeriscopeIE,
@@ -808,11 +846,16 @@ from .radiobremen import RadioBremenIE
 from .radiofrance import RadioFranceIE
 from .rai import (
     RaiPlayIE,
+    RaiPlayLiveIE,
     RaiIE,
 )
 from .rbmaradio import RBMARadioIE
 from .rds import RDSIE
 from .redbulltv import RedBullTVIE
+from .reddit import (
+    RedditIE,
+    RedditRIE,
+)
 from .redtube import RedTubeIE
 from .regiotv import RegioTVIE
 from .rentv import (
@@ -856,9 +899,11 @@ from .rutube import (
     RutubeEmbedIE,
     RutubeMovieIE,
     RutubePersonIE,
+    RutubePlaylistIE,
 )
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
+from .ruv import RuvIE
 from .sandia import SandiaIE
 from .safari import (
     SafariIE,
@@ -905,8 +950,9 @@ from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
     SoundcloudUserIE,
+    SoundcloudTrackStationIE,
     SoundcloudPlaylistIE,
-    SoundcloudSearchIE
+    SoundcloudSearchIE,
 )
 from .soundgasm import (
     SoundgasmIE,
@@ -955,6 +1001,7 @@ from .tagesschau import (
     TagesschauIE,
 )
 from .tass import TassIE
+from .tastytrade import TastyTradeIE
 from .tbs import TBSIE
 from .tdslifeway import TDSLifewayIE
 from .teachertube import (
@@ -963,7 +1010,6 @@ from .teachertube import (
 )
 from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
-from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tele13 import Tele13IE
@@ -1012,11 +1058,6 @@ from .trilulilu import TriluliluIE
 from .trutv import TruTVIE
 from .tube8 import Tube8IE
 from .tubitv import TubiTvIE
-from .tudou import (
-    TudouIE,
-    TudouPlaylistIE,
-    TudouAlbumIE,
-)
 from .tumblr import TumblrIE
 from .tunein import (
     TuneInClipIE,
@@ -1096,6 +1137,10 @@ from .uplynk import (
     UplynkIE,
     UplynkPreplayIE,
 )
+from .upskill import (
+    UpskillIE,
+    UpskillCourseIE,
+)
 from .urort import UrortIE
 from .urplay import URPlayIE
 from .usanetwork import USANetworkIE
@@ -1123,6 +1168,7 @@ from .vgtv import (
 from .vh1 import VH1IE
 from .vice import (
     ViceIE,
+    ViceArticleIE,
     ViceShowIE,
 )
 from .viceland import VicelandIE
@@ -1185,12 +1231,14 @@ from .vk import (
 )
 from .vlive import (
     VLiveIE,
-    VLiveChannelIE
+    VLiveChannelIE,
+    VLivePlaylistIE
 )
 from .vodlocker import VodlockerIE
 from .vodpl import VODPlIE
 from .vodplatform import VODPlatformIE
 from .voicerepublic import VoiceRepublicIE
+from .voot import VootIE
 from .voxmedia import VoxMediaIE
 from .vporn import VpornIE
 from .vrt import VRTIE
@@ -1212,6 +1260,7 @@ from .washingtonpost import (
     WashingtonPostArticleIE,
 )
 from .wat import WatIE
+from .watchbox import WatchBoxIE
 from .watchindianporn import WatchIndianPornIE
 from .wdr import (
     WDRIE,
@@ -1261,12 +1310,12 @@ from .yahoo import (
     YahooIE,
     YahooSearchIE,
 )
-from .yam import YamIE
 from .yandexmusic import (
     YandexMusicTrackIE,
     YandexMusicAlbumIE,
     YandexMusicPlaylistIE,
 )
+from .yandexdisk import YandexDiskIE
 from .yesjapan import YesJapanIE
 from .yinyuetai import YinYueTaiIE
 from .ynet import YnetIE
@@ -1298,5 +1347,6 @@ from .youtube import (
     YoutubeWatchLaterIE,
 )
 from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
 from .zdf import ZDFIE, ZDFChannelIE
 from .zingmp3 import ZingMp3IE
index b69c1ede0046d73e31df2098f78cf6dc20c254d4..4b3f6cc86b57f283f08faedfa481bfbc9719a879 100644 (file)
@@ -203,19 +203,19 @@ class FacebookIE(InfoExtractor):
     }]
 
     @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
-        if mobj is not None:
-            return mobj.group('url')
-
+    def _extract_urls(webpage):
+        urls = []
+        for mobj in re.finditer(
+                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+                webpage):
+            urls.append(mobj.group('url'))
         # Facebook API embed
         # see https://developers.facebook.com/docs/plugins/embedded-video-player
-        mobj = re.search(r'''(?x)<div[^>]+
+        for mobj in re.finditer(r'''(?x)<div[^>]+
                 class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
-                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage)
-        if mobj is not None:
-            return mobj.group('url')
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
+            urls.append(mobj.group('url'))
+        return urls
 
     def _login(self):
         (useremail, password) = self._get_login_info()
index 081c7184233d3e79d0a2d684bd693631b7600eb2..4803a22c81d16c2540f33cfb97bdc5a1d7f7aa57 100644 (file)
@@ -102,6 +102,8 @@ class FirstTVIE(InfoExtractor):
                     'format_id': f.get('name'),
                     'tbr': tbr,
                     'source_preference': quality(f.get('name')),
+                    # quality metadata of http formats may be incorrect
+                    'preference': -1,
                 })
             # m3u8 URL format is reverse engineered from [1] (search for
             # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru)
index 15736c9fe91e6d5a860641bcbd3be49636b83b47..9f9863746d3daacb8f008a36add7d237ea6ee12f 100644 (file)
@@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor):
         'info_dict': {
             'id': 'glavnoe',
             'ext': 'mp4',
-            'title': 'Итоги недели с 8 по 14 июня 2015 года',
+            'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         video_url = self._search_regex(
-            r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+            [r'<div[^>]+?class="flowplayer[^>]+?data-href="([^"]+)"',
+             r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
             webpage, 'video url')
 
         title = self._og_search_title(webpage, default=None) or self._search_regex(
index a8e1bf42a433fd87f638e8b34ce5ab68464a9252..9f166efd4851fe0833de8a85bac07a97ce9f1722 100644 (file)
@@ -1,7 +1,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -81,7 +84,7 @@ class FlickrIE(InfoExtractor):
 
             formats = []
             for stream in streams['stream']:
-                stream_type = str(stream.get('type'))
+                stream_type = compat_str(stream.get('type'))
                 formats.append({
                     'format_id': stream_type,
                     'url': stream['_content'],
index 9776c8422228f1f44b5b0a3bf0c40a125e1fa50a..ad273a0e70c3fbd9087779d33829b817d9d70127 100644 (file)
@@ -3,39 +3,22 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     parse_duration,
     parse_iso8601,
-    sanitized_Request,
     str_to_int,
 )
 
 
-class FourTubeIE(InfoExtractor):
-    IE_NAME = '4tube'
-    _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
+class FourTubeBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
 
-    _TEST = {
-        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
-        'md5': '6516c8ac63b03de06bc8eac14362db4f',
-        'info_dict': {
-            'id': '209733',
-            'ext': 'mp4',
-            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
-            'uploader': 'WCP Club',
-            'uploader_id': 'wcp-club',
-            'upload_date': '20131031',
-            'timestamp': 1383263892,
-            'duration': 583,
-            'view_count': int,
-            'like_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }
+        if kind == 'm' or not display_id:
+            url = self._URL_TEMPLATE % video_id
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
         title = self._html_search_meta('name', webpage)
@@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor):
             'uploadDate', webpage))
         thumbnail = self._html_search_meta('thumbnailUrl', webpage)
         uploader_id = self._html_search_regex(
-            r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">',
             webpage, 'uploader id', fatal=False)
         uploader = self._html_search_regex(
-            r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">',
             webpage, 'uploader', fatal=False)
 
         categories_html = self._search_regex(
@@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor):
 
         view_count = str_to_int(self._search_regex(
             r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
-            webpage, 'view count', fatal=False))
+            webpage, 'view count', default=None))
         like_count = str_to_int(self._search_regex(
             r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
-            webpage, 'like count', fatal=False))
+            webpage, 'like count', default=None))
         duration = parse_duration(self._html_search_meta('duration', webpage))
 
         media_id = self._search_regex(
@@ -85,14 +68,14 @@ class FourTubeIE(InfoExtractor):
             media_id = params[0]
             sources = ['%s' % p for p in params[2]]
 
-        token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
+        token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format(
             media_id, '+'.join(sources))
-        headers = {
-            b'Content-Type': b'application/x-www-form-urlencoded',
-            b'Origin': b'http://www.4tube.com',
-        }
-        token_req = sanitized_Request(token_url, b'{}', headers)
-        tokens = self._download_json(token_req, video_id)
+
+        parsed_url = compat_urlparse.urlparse(url)
+        tokens = self._download_json(token_url, video_id, data=b'', headers={
+            'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname),
+            'Referer': url,
+        })
         formats = [{
             'url': tokens[format]['token'],
             'format_id': format + 'p',
@@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor):
             'duration': duration,
             'age_limit': 18,
         }
+
+
+class FourTubeIE(FourTubeBaseIE):
+    IE_NAME = '4tube'
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video'
+    _TESTS = [{
+        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '209733',
+            'ext': 'mp4',
+            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
+            'uploader': 'WCP Club',
+            'uploader_id': 'wcp-club',
+            'upload_date': '20131031',
+            'timestamp': 1383263892,
+            'duration': 583,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://www.4tube.com/embed/209733',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'only_matching': True,
+    }]
+
+
+class FuxIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.fux.com/video/%s/video'
+    _TESTS = [{
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'info_dict': {
+            'id': '195359',
+            'ext': 'mp4',
+            'title': 'Awesome fucking in the kitchen ends with cum swallow',
+            'uploader': 'alenci2342',
+            'uploader_id': 'alenci2342',
+            'upload_date': '20131230',
+            'timestamp': 1388361660,
+            'duration': 289,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.fux.com/embed/195359',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'only_matching': True,
+    }]
+
+
+class PornTubeIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s'
+    _TESTS = [{
+        'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'info_dict': {
+            'id': '7089759',
+            'ext': 'mp4',
+            'title': 'Teen couple doing anal',
+            'uploader': 'Alexy',
+            'uploader_id': 'Alexy',
+            'upload_date': '20150606',
+            'timestamp': 1433595647,
+            'duration': 5052,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.porntube.com/embed/7089759',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'only_matching': True,
+    }]
+
+
+class PornerBrosIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s'
+    _TESTS = [{
+        'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '181369',
+            'ext': 'mp4',
+            'title': 'Skinny brunette takes big cock down her anal hole',
+            'uploader': 'PornerBros HD',
+            'uploader_id': 'pornerbros-hd',
+            'upload_date': '20130130',
+            'timestamp': 1359527401,
+            'duration': 1224,
+            'view_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.pornerbros.com/embed/181369',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'only_matching': True,
+    }]
index 159fdf9c476df24e15c2f02fcce513922acf85ca..facc665f606238fb2ebb54193fe4e525141216f4 100644 (file)
@@ -3,56 +3,99 @@ from __future__ import unicode_literals
 
 from .adobepass import AdobePassIE
 from ..utils import (
-    smuggle_url,
-    update_url_query,
+    int_or_none,
+    parse_age_limit,
+    parse_duration,
+    try_get,
+    unified_timestamp,
 )
 
 
 class FOXIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.fox.com/watch/255180355939/7684182528',
+    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+    _TESTS = [{
+        # clip
+        'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
         'md5': 'ebd296fcc41dd4b19f8115d8461a3165',
         'info_dict': {
-            'id': '255180355939',
+            'id': '4b765a60490325103ea69888fb2bd4e8',
             'ext': 'mp4',
-            'title': 'Official Trailer: Gotham',
-            'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.',
-            'duration': 129,
-            'timestamp': 1400020798,
-            'upload_date': '20140513',
-            'uploader': 'NEWA-FNG-FOXCOM',
+            'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
+            'description': 'md5:549cd9c70d413adb32ce2a779b53b486',
+            'duration': 102,
+            'timestamp': 1504291893,
+            'upload_date': '20170901',
+            'creator': 'FOX',
+            'series': 'Gotham',
         },
-        'add_ie': ['ThePlatform'],
-    }
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # episode, geo-restricted
+        'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
+        'only_matching': True,
+    }, {
+        # episode, geo-restricted, tv provided required
+        'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        settings = self._parse_json(self._search_regex(
-            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-            webpage, 'drupal settings'), video_id)
-        fox_pdk_player = settings['fox_pdk_player']
-        release_url = fox_pdk_player['release_url']
-        query = {
-            'mbr': 'true',
-            'switch': 'http'
-        }
-        if fox_pdk_player.get('access') == 'locked':
-            ap_p = settings['foxAdobePassProvider']
-            rating = ap_p.get('videoRating')
-            if rating == 'n/a':
-                rating = None
-            resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)
-            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource)
-
-        info = self._search_json_ld(webpage, video_id, fatal=False)
-        info.update({
-            '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
-            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
-            'id': video_id,
-        })
 
-        return info
+        video = self._download_json(
+            'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id,
+            video_id, headers={
+                'apikey': 'abdcbed02c124d393b39e818a4312055',
+                'Content-Type': 'application/json',
+                'Referer': url,
+            })
+
+        title = video['name']
+
+        m3u8_url = self._download_json(
+            video['videoRelease']['url'], video_id)['playURL']
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = video.get('description')
+        duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
+            video.get('duration')) or parse_duration(video.get('duration'))
+        timestamp = unified_timestamp(video.get('datePublished'))
+        age_limit = parse_age_limit(video.get('contentRating'))
+
+        data = try_get(
+            video, lambda x: x['trackingData']['properties'], dict) or {}
+
+        creator = data.get('brand') or data.get('network') or video.get('network')
+
+        series = video.get('seriesName') or data.get(
+            'seriesName') or data.get('show')
+        season_number = int_or_none(video.get('seasonNumber'))
+        episode = video.get('name')
+        episode_number = int_or_none(video.get('episodeNumber'))
+        release_year = int_or_none(video.get('releaseYear'))
+
+        if data.get('authRequired'):
+            # TODO: AP
+            pass
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'age_limit': age_limit,
+            'creator': creator,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'release_year': release_year,
+            'formats': formats,
+        }
index e887ae48869426617fdbf797182cef93f97ac2ef..512a106455cc7460d81f48d2890bc457fccbe661 100644 (file)
@@ -5,6 +5,7 @@ import itertools
 from .common import InfoExtractor
 from ..utils import (
     get_element_by_id,
+    int_or_none,
     remove_end,
 )
 
@@ -46,7 +47,7 @@ class FoxgayIE(InfoExtractor):
 
         formats = [{
             'url': source,
-            'height': resolution,
+            'height': int_or_none(resolution),
         } for source, resolution in zip(
             video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))]
 
index a3bb98377cf4feb769d89769c40fe7098ae20743..985542727e4273dc1ee379dc2e82e12a580517c2 100644 (file)
@@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
 
     _TEST = {
-        'url': 'http://www.foxsports.com/video?vid=432609859715',
+        'url': 'http://www.foxsports.com/tennessee/video/432609859715',
         'md5': 'b49050e955bebe32c301972e4012ac17',
         'info_dict': {
-            'id': 'i0qKWsk3qJaM',
+            'id': 'bwduI3X_TgUB',
             'ext': 'mp4',
             'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
             'description': 'Courtney Lee talks about Memphis being focused.',
@@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         config = self._parse_json(
-            self._search_regex(
-                r"data-player-config='([^']+)'", webpage, 'data player config'),
+            self._html_search_regex(
+                r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
+                webpage, 'data player config'),
             video_id)
 
         return self.url_result(smuggle_url(update_url_query(
index 48d43ae58e80bd3b054068e59f4e43464e31ec0f..2bcbb3e39bb5502e83f64b45b3ca115d7a3bbaa9 100644 (file)
@@ -21,11 +21,13 @@ from .dailymotion import (
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
-    def _extract_video(self, video_id, catalogue):
+    def _extract_video(self, video_id, catalogue=None):
         info = self._download_json(
-            'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s'
-            % (video_id, catalogue),
-            video_id, 'Downloading video JSON')
+            'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
+            video_id, 'Downloading video JSON', query={
+                'idDiffusion': video_id,
+                'catalogue': catalogue or '',
+            })
 
         if info.get('status') == 'NOK':
             raise ExtractorError(
@@ -109,27 +111,100 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
         }
 
 
-class PluzzIE(FranceTVBaseInfoExtractor):
-    IE_NAME = 'pluzz.francetv.fr'
-    _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
+class FranceTVIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html'
 
-    # Can't use tests, videos expire in 7 days
+    _TESTS = [{
+        'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
+        'info_dict': {
+            'id': '157550144',
+            'ext': 'mp4',
+            'title': '13h15, le dimanche... - Les mystères de Jésus',
+            'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+            'timestamp': 1494156300,
+            'upload_date': '20170507',
+        },
+        'params': {
+            # m3u8 downloads
+            'skip_download': True,
+        },
+    }, {
+        # france3
+        'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
+        'only_matching': True,
+    }, {
+        # france4
+        'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
+        'only_matching': True,
+    }, {
+        # france5
+        'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
+        'only_matching': True,
+    }, {
+        # franceo
+        'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
+        'only_matching': True,
+    }, {
+        # france2 live
+        'url': 'https://www.france.tv/france-2/direct.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/142749-rouge-sang.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
-        video_id = self._html_search_meta(
-            'id_video', webpage, 'video id', default=None)
+        catalogue = None
+        video_id = self._search_regex(
+            r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
+            webpage, 'video id', default=None, group='id')
+
         if not video_id:
-            video_id = self._search_regex(
-                r'data-diffusion=["\'](\d+)', webpage, 'video id')
+            video_id, catalogue = self._html_search_regex(
+                r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+                webpage, 'video ID').split('@')
+        return self._extract_video(video_id, catalogue)
 
-        return self._extract_video(video_id, 'Pluzz')
 
+class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
 
-class FranceTvInfoIE(FranceTVBaseInfoExtractor):
+    _TEST = {
+        'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
+        'info_dict': {
+            'id': 'NI_983319',
+            'ext': 'mp4',
+            'title': 'Le Pen Reims',
+            'upload_date': '20170505',
+            'timestamp': 1493981780,
+            'duration': 16,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
+            video_id)
+
+        return self._extract_video(video['video_id'], video.get('catalog'))
+
+
+class FranceTVInfoIE(FranceTVBaseInfoExtractor):
     IE_NAME = 'francetvinfo.fr'
     _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)'
 
@@ -233,124 +308,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
         return self._extract_video(video_id, catalogue)
 
 
-class FranceTVIE(FranceTVBaseInfoExtractor):
-    IE_NAME = 'francetv'
-    IE_DESC = 'France 2, 3, 4, 5 and Ô'
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:
-                            (?:www\.)?france[2345o]\.fr/
-                                (?:
-                                    emissions/[^/]+/(?:videos|diffusions)|
-                                    emission/[^/]+|
-                                    videos|
-                                    jt
-                                )
-                            /|
-                            embed\.francetv\.fr/\?ue=
-                        )
-                        (?P<id>[^/?]+)
-                    '''
-
-    _TESTS = [
-        # france2
-        {
-            'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
-            'md5': 'c03fc87cb85429ffd55df32b9fc05523',
-            'info_dict': {
-                'id': '109169362',
-                'ext': 'flv',
-                'title': '13h15, le dimanche...',
-                'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7',
-                'upload_date': '20140914',
-                'timestamp': 1410693600,
-            },
-        },
-        # france3
-        {
-            'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
-            'md5': '679bb8f8921f8623bd658fa2f8364da0',
-            'info_dict': {
-                'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
-                'ext': 'mp4',
-                'title': 'Le scandale du prix des médicaments',
-                'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
-                'upload_date': '20131113',
-                'timestamp': 1384380000,
-            },
-        },
-        # france4
-        {
-            'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
-            'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c',
-            'info_dict': {
-                'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
-                'ext': 'mp4',
-                'title': 'Hero Corp Making of - Extrait 1',
-                'description': 'md5:c87d54871b1790679aec1197e73d650a',
-                'upload_date': '20131106',
-                'timestamp': 1383766500,
-            },
-        },
-        # france5
-        {
-            'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1',
-            'md5': 'f6c577df3806e26471b3d21631241fd0',
-            'info_dict': {
-                'id': '123327454',
-                'ext': 'flv',
-                'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?',
-                'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4',
-                'upload_date': '20150831',
-                'timestamp': 1441035120,
-            },
-        },
-        # franceo
-        {
-            'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
-            'md5': '47d5816d3b24351cdce512ad7ab31da8',
-            'info_dict': {
-                'id': '125377621',
-                'ext': 'flv',
-                'title': 'Infô soir',
-                'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
-                'upload_date': '20150718',
-                'timestamp': 1437241200,
-                'duration': 414,
-            },
-        },
-        {
-            # francetv embed
-            'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
-            'info_dict': {
-                'id': 'EV_30231',
-                'ext': 'flv',
-                'title': 'Alcaline, le concert avec Calogero',
-                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
-                'upload_date': '20150226',
-                'timestamp': 1424989860,
-                'duration': 5400,
-            },
-        },
-        {
-            'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
-            'only_matching': True,
-        },
-        {
-            'url': 'http://www.franceo.fr/videos/125377617',
-            'only_matching': True,
-        }
-    ]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_id, catalogue = self._html_search_regex(
-            r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
-            webpage, 'video ID').split('@')
-        return self._extract_video(video_id, catalogue)
-
-
 class GenerationQuoiIE(InfoExtractor):
     IE_NAME = 'france2.fr:generation-quoi'
     _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)'
index e44a2a87fa9759b170ed5cc780511a6b42faf695..8c37509ec60f132671952c8ad018e4ba79a5fac6 100644 (file)
@@ -2,15 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_urllib_parse_unquote_plus,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
     int_or_none,
     js_to_json,
-    sanitized_Request,
     ExtractorError,
     urlencode_postdata
 )
@@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
 
     _NETRC_MACHINE = 'funimation'
+    _TOKEN = None
 
     _TESTS = [{
         'url': 'https://www.funimation.com/shows/hacksign/role-play/',
@@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor):
     }, {
         'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
         'info_dict': {
-            'id': '9635',
+            'id': '210051',
             'display_id': 'broadcast-dub-preview',
             'ext': 'mp4',
             'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
-            'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
             'thumbnail': r're:https?://.*\.(?:jpg|png)',
         },
-        'skip': 'Access without user interaction is forbidden by CloudFlare',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
         'only_matching': True,
     }]
 
-    _LOGIN_URL = 'http://www.funimation.com/login'
-
-    def _extract_cloudflare_session_ua(self, url):
-        ci_session_cookie = self._get_cookies(url).get('ci_session')
-        if ci_session_cookie:
-            ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
-            # ci_session is a string serialized by PHP function serialize()
-            # This case is simple enough to use regular expressions only
-            return self._search_regex(
-                r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
-                default=None)
-
     def _login(self):
         (username, password) = self._get_login_info()
         if username is None:
             return
-        data = urlencode_postdata({
-            'email_field': username,
-            'password_field': password,
-        })
-        user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
-        if not user_agent:
-            user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
-        login_request = sanitized_Request(self._LOGIN_URL, data, headers={
-            'User-Agent': user_agent,
-            'Content-Type': 'application/x-www-form-urlencoded'
-        })
-        login_page = self._download_webpage(
-            login_request, None, 'Logging in as %s' % username)
-        if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')):
-            return
-        error = self._html_search_regex(
-            r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>',
-            login_page, 'error messages', default=None)
-        if error:
-            raise ExtractorError('Unable to login: %s' % error, expected=True)
-        raise ExtractorError('Unable to log in')
+        try:
+            data = self._download_json(
+                'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/',
+                None, 'Logging in as %s' % username, data=urlencode_postdata({
+                    'username': username,
+                    'password': password,
+                }))
+            self._TOKEN = data['token']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                error = self._parse_json(e.cause.read().decode(), None)['error']
+                raise ExtractorError(error, expected=True)
+            raise
 
     def _real_initialize(self):
         self._login()
@@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor):
         description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
 
         try:
+            headers = {}
+            if self._TOKEN:
+                headers['Authorization'] = 'Token %s' % self._TOKEN
             sources = self._download_json(
                 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
-                video_id)['items']
+                video_id, headers=headers)['items']
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                 error = self._parse_json(e.cause.read(), video_id)['errors'][0]
index 49409369cc5e72e626b5fbe64f8a346d9695f802..f85e7de1496b07848e19b491b7ef5312652a0d7c 100644 (file)
@@ -1,10 +1,14 @@
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+)
 
 
 class FunnyOrDieIE(InfoExtractor):
@@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
             'title': 'Heart-Shaped Box: Literal Video Version',
             'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
             'thumbnail': r're:^http:.*\.jpg$',
+            'uploader': 'DASjr',
+            'timestamp': 1317904928,
+            'upload_date': '20111006',
+            'duration': 318.3,
         },
     }, {
         'url': 'http://www.funnyordie.com/embed/e402820827',
@@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
             'title': 'Please Use This Song (Jon Lajoie)',
             'description': 'Please use this to sell something.  www.jonlajoie.com',
             'thumbnail': r're:^http:.*\.jpg$',
+            'timestamp': 1398988800,
+            'upload_date': '20140502',
         },
         'params': {
             'skip_download': True,
@@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
                 'url': 'http://www.funnyordie.com%s' % src,
             }]
 
-        post_json = self._search_regex(
-            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
-        post = json.loads(post_json)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'uploadDate', webpage, 'timestamp', default=None))
+
+        uploader = self._html_search_regex(
+            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
+            webpage, 'uploader', default=None)
+
+        title, description, thumbnail, duration = [None] * 4
+
+        medium = self._parse_json(
+            self._search_regex(
+                r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
+                default='{}'),
+            video_id, fatal=False)
+        if medium:
+            title = medium.get('title')
+            duration = float_or_none(medium.get('duration'))
+            if not timestamp:
+                timestamp = unified_timestamp(medium.get('publishDate'))
+
+        post = self._parse_json(
+            self._search_regex(
+                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
+                default='{}'),
+            video_id, fatal=False)
+        if post:
+            if not title:
+                title = post.get('name')
+            description = post.get('description')
+            thumbnail = post.get('picture')
+
+        if not title:
+            title = self._og_search_title(webpage)
+        if not description:
+            description = self._og_search_description(webpage)
+        if not duration:
+            duration = int_or_none(self._html_search_meta(
+                ('video:duration', 'duration'), webpage, 'duration', default=False))
 
         return {
             'id': video_id,
-            'title': post['name'],
-            'description': post.get('description'),
-            'thumbnail': post.get('picture'),
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
             'formats': formats,
             'subtitles': subtitles,
         }
index 36ba7d8cf3961198b55454346f95f78a938717ff..1726a67049e63ebdc86407f6de244838be1aa47a 100644 (file)
@@ -6,62 +6,52 @@ from .common import InfoExtractor
 from ..utils import (
     float_or_none,
     int_or_none,
-    js_to_json,
     unified_strdate,
 )
 
 
 class GaskrankIE(InfoExtractor):
-    """InfoExtractor for gaskrank.tv"""
-    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.html?'
-    _TESTS = [
-        {
-            'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm',
-            'md5': '1ae88dbac97887d85ebd1157a95fc4f9',
-            'info_dict': {
-                'id': '201601/26955',
-                'ext': 'mp4',
-                'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'categories': ['motorrad-fun'],
-                'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden',
-                'uploader_id': 'Bikefun',
-                'upload_date': '20170110',
-                'uploader_url': None,
-            }
-        },
-        {
-            'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm',
-            'md5': 'c33ee32c711bc6c8224bfcbe62b23095',
-            'info_dict': {
-                'id': '201106/15920',
-                'ext': 'mp4',
-                'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'categories': ['racing'],
-                'display_id': 'isle-of-man-tt-2011-michael-du-15920',
-                'uploader_id': 'IOM',
-                'upload_date': '20160506',
-                'uploader_url': 'www.iomtt.com',
-            }
+    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm'
+    _TESTS = [{
+        'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm',
+        'md5': '1ae88dbac97887d85ebd1157a95fc4f9',
+        'info_dict': {
+            'id': '201601/26955',
+            'ext': 'mp4',
+            'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['motorrad-fun'],
+            'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden',
+            'uploader_id': 'Bikefun',
+            'upload_date': '20170110',
+            'uploader_url': None,
         }
-    ]
+    }, {
+        'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm',
+        'md5': 'c33ee32c711bc6c8224bfcbe62b23095',
+        'info_dict': {
+            'id': '201106/15920',
+            'ext': 'mp4',
+            'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['racing'],
+            'display_id': 'isle-of-man-tt-2011-michael-du-15920',
+            'uploader_id': 'IOM',
+            'upload_date': '20170523',
+            'uploader_url': 'www.iomtt.com',
+        }
+    }]
 
     def _real_extract(self, url):
-        """extract information from gaskrank.tv"""
-        def fix_json(code):
-            """Removes trailing comma in json: {{},} --> {{}}"""
-            return re.sub(r',\s*}', r'}', js_to_json(code))
-
         display_id = self._match_id(url)
+
         webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, fatal=True)
+
         categories = [re.match(self._VALID_URL, url).group('categories')]
-        title = self._search_regex(
-            r'movieName\s*:\s*\'([^\']*)\'',
-            webpage, 'title')
-        thumbnail = self._search_regex(
-            r'poster\s*:\s*\'([^\']*)\'',
-            webpage, 'thumbnail', default=None)
 
         mobj = re.search(
             r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])',
@@ -89,29 +79,14 @@ class GaskrankIE(InfoExtractor):
         if average_rating:
             average_rating = float_or_none(average_rating.replace(',', '.'))
 
-        playlist = self._parse_json(
-            self._search_regex(
-                r'playlist\s*:\s*\[([^\]]*)\]',
-                webpage, 'playlist', default='{}'),
-            display_id, transform_source=fix_json, fatal=False)
-
         video_id = self._search_regex(
             r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4',
-            playlist.get('0').get('src'), 'video id')
-
-        formats = []
-        for key in playlist:
-            formats.append({
-                'url': playlist[key]['src'],
-                'format_id': key,
-                'quality': playlist[key].get('quality')})
-        self._sort_formats(formats, field_preference=['format_id'])
+            webpage, 'video id', default=display_id)
 
-        return {
+        entry = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        entry.update({
             'id': video_id,
             'title': title,
-            'formats': formats,
-            'thumbnail': thumbnail,
             'categories': categories,
             'display_id': display_id,
             'uploader_id': uploader_id,
@@ -120,4 +95,7 @@ class GaskrankIE(InfoExtractor):
             'tags': tags,
             'view_count': view_count,
             'average_rating': average_rating,
-        }
+        })
+        self._sort_formats(entry['formats'])
+
+        return entry
index 3136427db39a2f1739fa0a791bd2cc85f1eedd02..f71d9092e5371d5d2c143058ded24cb5f4ca1958 100644 (file)
@@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor):
                 'format': 'jp',  # The japanese audio
             }
         },
+        {
+            # gdc-player.html
+            'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',
+            'info_dict': {
+                'id': '1435',
+                'display_id': 'An-American-engine-in-Tokyo',
+                'ext': 'flv',
+                'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',
+            },
+            'params': {
+                'skip_download': True,  # Requires rtmpdump
+            },
+        },
     ]
 
     def _login(self, webpage_url, display_id):
@@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor):
                 'title': title,
             }
 
-        PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>'
+        PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
 
         xml_root = self._html_search_regex(
             PLAYER_REGEX, start_page, 'xml root', default=None)
index 67184bc5d81f605450809b165dfef8be0d950202..b83c18380d2ba24ff0ce4909cf5a66643ae19d41 100644 (file)
@@ -10,6 +10,7 @@ from .common import InfoExtractor
 from .youtube import YoutubeIE
 from ..compat import (
     compat_etree_fromstring,
+    compat_str,
     compat_urllib_parse_unquote,
     compat_urlparse,
     compat_xml_parse_error,
@@ -35,6 +36,10 @@ from .brightcove import (
     BrightcoveLegacyIE,
     BrightcoveNewIE,
 )
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
 from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
@@ -56,6 +61,7 @@ from .dailymotion import (
     DailymotionIE,
     DailymotionCloudIE,
 )
+from .dailymail import DailyMailIE
 from .onionstudios import OnionStudiosIE
 from .viewlift import ViewLiftEmbedIE
 from .mtv import MTVServicesEmbeddedIE
@@ -86,6 +92,13 @@ from .openload import OpenloadIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
 from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
 
 
 class GenericIE(InfoExtractor):
@@ -563,6 +576,19 @@ class GenericIE(InfoExtractor):
             },
             'skip': 'movie expired',
         },
+        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+        {
+            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+            'info_dict': {
+                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+                'ext': 'mp4',
+                'title': 'Steampunk Fest Comes to Honesdale',
+                'duration': 43.276,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
         # embed.ly video
         {
             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -754,6 +780,20 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Dailymotion'],
         },
+        # DailyMail embed
+        {
+            'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+            'info_dict': {
+                'id': '1495629',
+                'ext': 'mp4',
+                'title': 'Care worker punches elderly dementia patient in head 11 times',
+                'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+            },
+            'add_ie': ['DailyMail'],
+            'params': {
+                'skip_download': True,
+            },
+        },
         # YouTube embed
         {
             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@@ -1180,7 +1220,7 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Kaltura'],
         },
-        # Eagle.Platform embed (generic URL)
+        # EaglePlatform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
             # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1194,8 +1234,26 @@ class GenericIE(InfoExtractor):
                 'view_count': int,
                 'age_limit': 0,
             },
+            'params': {
+                'skip_download': True,
+            },
         },
-        # ClipYou (Eagle.Platform) embed (custom URL)
+        # referrer protected EaglePlatform embed
+        {
+            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+            'info_dict': {
+                'id': '582306',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 3382,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # ClipYou (EaglePlatform) embed (custom URL)
         {
             'url': 'http://muz-tv.ru/play/7129/',
             # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1207,6 +1265,9 @@ class GenericIE(InfoExtractor):
                 'duration': 216,
                 'view_count': int,
             },
+            'params': {
+                'skip_download': True,
+            },
         },
         # Pladform embed
         {
@@ -1427,6 +1488,22 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # Brightcove embed with whitespace around attribute names
+            'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+            'info_dict': {
+                'id': '3167554373001',
+                'ext': 'mp4',
+                'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+                'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+                'uploader_id': '1079349493',
+                'upload_date': '20140207',
+                'timestamp': 1391810548,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # Another form of arte.tv embed
         {
             'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1442,14 +1519,27 @@ class GenericIE(InfoExtractor):
         # LiveLeak embed
         {
             'url': 'http://www.wykop.pl/link/3088787/',
-            'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+            'md5': '7619da8c820e835bef21a1efa2a0fc71',
             'info_dict': {
                 'id': '874_1459135191',
                 'ext': 'mp4',
                 'title': 'Man shows poor quality of new apartment building',
                 'description': 'The wall is like a sand pile.',
                 'uploader': 'Lake8737',
-            }
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
+        },
+        # Another LiveLeak embed pattern (#13336)
+        {
+            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+            'info_dict': {
+                'id': '2eb_1496309988',
+                'ext': 'mp4',
+                'title': 'Thief robs place where everyone was armed',
+                'description': 'md5:694d73ee79e535953cf2488562288eee',
+                'uploader': 'brazilwtf',
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
         },
         # Duplicated embedded video URLs
         {
@@ -1491,6 +1581,22 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['BrightcoveLegacy'],
         },
+        # Nexx embed
+        {
+            'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
+            'info_dict': {
+                'id': '247746',
+                'ext': 'mp4',
+                'title': "Yesterday's Jam (OV)",
+                'description': 'md5:09bc0984723fed34e2581624a84e05f0',
+                'timestamp': 1492594816,
+                'upload_date': '20170419',
+            },
+            'params': {
+                'format': 'bestvideo',
+                'skip_download': True,
+            },
+        },
         # Facebook <iframe> embed
         {
             'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1501,6 +1607,21 @@ class GenericIE(InfoExtractor):
                 'title': 'Facebook video #599637780109885',
             },
         },
+        # Facebook <iframe> embed, plugin video
+        {
+            'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
+            'info_dict': {
+                'id': '1754168231264132',
+                'ext': 'mp4',
+                'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
+                'uploader': 'Tariq Ramadan (official)',
+                'timestamp': 1496758379,
+                'upload_date': '20170606',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         # Facebook API embed
         {
             'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
@@ -1677,6 +1798,87 @@ class GenericIE(InfoExtractor):
             },
             'playlist_mincount': 5,
         },
+        {
+            # Limelight embed (LimelightPlayerUtil.embed)
+            'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+            'info_dict': {
+                'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+                'ext': 'mp4',
+                'title': '07448641',
+                'timestamp': 1499890639,
+                'upload_date': '20170712',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['LimelightMedia'],
+        },
+        {
+            'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+            'info_dict': {
+                'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+                'title': 'Standoff with Walnut Creek murder suspect ends',
+                'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+            },
+            'playlist_mincount': 4,
+        },
+        {
+            # WashingtonPost embed
+            'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+            'info_dict': {
+                'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+                'ext': 'mp4',
+                'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+                'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+                'timestamp': 1455216756,
+                'uploader': 'The Washington Post',
+                'upload_date': '20160211',
+            },
+            'add_ie': [WashingtonPostIE.ie_key()],
+        },
+        {
+            # Mediaset embed
+            'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+            'info_dict': {
+                'id': '720642',
+                'ext': 'mp4',
+                'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [MediasetIE.ie_key()],
+        },
+        {
+            # JOJ.sk embeds
+            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+            'info_dict': {
+                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+                'title': 'Slovenskom sa prehnala vlna silných búrok',
+            },
+            'playlist_mincount': 5,
+            'add_ie': [JojIE.ie_key()],
+        },
+        {
+            # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+            'url': 'https://tvrain.ru/amp/418921/',
+            'md5': 'cc00413936695987e8de148b67d14f1d',
+            'info_dict': {
+                'id': '418921',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+            },
+        },
+        {
+            # vzaar embed
+            'url': 'http://help.vzaar.com/article/165-embedding-video',
+            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+            'info_dict': {
+                'id': '8707641',
+                'ext': 'mp4',
+                'title': 'Building A Business Online: Principal Chairs Q & A',
+            },
+        },
         # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
@@ -1826,7 +2028,7 @@ class GenericIE(InfoExtractor):
 
         if head_response is not False:
             # Check for redirect
-            new_url = head_response.geturl()
+            new_url = compat_str(head_response.geturl())
             if url != new_url:
                 self.report_following_redirect(new_url)
                 if force_videoid:
@@ -1851,14 +2053,14 @@ class GenericIE(InfoExtractor):
         content_type = head_response.headers.get('Content-Type', '').lower()
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
-            format_id = m.group('format_id')
+            format_id = compat_str(m.group('format_id'))
             if format_id.endswith('mpegurl'):
                 formats = self._extract_m3u8_formats(url, video_id, 'mp4')
             elif format_id == 'f4m':
                 formats = self._extract_f4m_formats(url, video_id)
             else:
                 formats = [{
-                    'format_id': m.group('format_id'),
+                    'format_id': format_id,
                     'url': url,
                     'vcodec': 'none' if m.group('type') == 'audio' else None
                 }]
@@ -1927,7 +2129,7 @@ class GenericIE(InfoExtractor):
             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                 info_dict['formats'] = self._parse_mpd_formats(
                     doc, video_id,
-                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
                     mpd_url=url)
                 self._sort_formats(info_dict['formats'])
                 return info_dict
@@ -1976,6 +2178,13 @@ class GenericIE(InfoExtractor):
         video_description = self._og_search_description(webpage, default=None)
         video_thumbnail = self._og_search_thumbnail(webpage, default=None)
 
+        info_dict.update({
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        })
+
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
         if bc_urls:
@@ -1997,6 +2206,16 @@ class GenericIE(InfoExtractor):
         if bc_urls:
             return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
 
+        # Look for Nexx embeds
+        nexx_urls = NexxIE._extract_urls(webpage)
+        if nexx_urls:
+            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+        # Look for Nexx iFrame embeds
+        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+        if nexx_embed_urls:
+            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
+
         # Look for ThePlatform embeds
         tp_urls = ThePlatformIE._extract_urls(webpage)
         if tp_urls:
@@ -2024,36 +2243,11 @@ class GenericIE(InfoExtractor):
         if vid_me_embed_url is not None:
             return self.url_result(vid_me_embed_url, 'Vidme')
 
-        # Look for embedded YouTube player
-        matches = re.findall(r'''(?x)
-            (?:
-                <iframe[^>]+?src=|
-                data-video-url=|
-                <embed[^>]+?src=|
-                embedSWF\(?:\s*|
-                <object[^>]+data=|
-                new\s+SWFObject\(
-            )
-            (["\'])
-                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
-                (?:embed|v|p)/.+?)
-            \1''', webpage)
-        if matches:
+        # Look for YouTube embeds
+        youtube_urls = YoutubeIE._extract_urls(webpage)
+        if youtube_urls:
             return self.playlist_from_matches(
-                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
-
-        # Look for lazyYT YouTube embed
-        matches = re.findall(
-            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
-
-        # Look for Wordpress "YouTube Video Importer" plugin
-        matches = re.findall(r'''(?x)<div[^>]+
-            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
-            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
+                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
 
         matches = DailymotionIE._extract_urls(webpage)
         if matches:
@@ -2069,58 +2263,27 @@ class GenericIE(InfoExtractor):
                 return self.playlist_from_matches(
                     playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
 
-        # Look for embedded Wistia player
-        match = re.search(
-            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
-        if match:
-            embed_url = self._proto_relative_url(
-                unescapeHTML(match.group('url')))
-            return {
-                '_type': 'url_transparent',
-                'url': embed_url,
-                'ie_key': 'Wistia',
-                'uploader': video_uploader,
-            }
+        # Look for DailyMail embeds
+        dailymail_urls = DailyMailIE._extract_urls(webpage)
+        if dailymail_urls:
+            return self.playlist_from_matches(
+                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
 
-        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
-        if match:
+        # Look for embedded Wistia player
+        wistia_url = WistiaIE._extract_url(webpage)
+        if wistia_url:
             return {
                 '_type': 'url_transparent',
-                'url': 'wistia:%s' % match.group('id'),
-                'ie_key': 'Wistia',
+                'url': self._proto_relative_url(wistia_url),
+                'ie_key': WistiaIE.ie_key(),
                 'uploader': video_uploader,
             }
 
-        match = re.search(
-            r'''(?sx)
-                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
-                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
-            ''', webpage)
-        if match:
-            return self.url_result(self._proto_relative_url(
-                'wistia:%s' % match.group('id')), 'Wistia')
-
         # Look for SVT player
         svt_url = SVTIE._extract_url(webpage)
         if svt_url:
             return self.url_result(svt_url, 'SVT')
 
-        # Look for embedded condenast player
-        matches = re.findall(
-            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
-            webpage)
-        if matches:
-            return {
-                '_type': 'playlist',
-                'entries': [{
-                    '_type': 'url',
-                    'ie_key': 'CondeNast',
-                    'url': ma,
-                } for ma in matches],
-                'title': video_title,
-                'id': video_id,
-            }
-
         # Look for Bandcamp pages with custom domain
         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
         if mobj is not None:
@@ -2157,6 +2320,7 @@ class GenericIE(InfoExtractor):
         # Look for Ooyala videos
         mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+                re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
         if mobj is not None:
@@ -2202,9 +2366,9 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
 
         # Look for embedded Facebook player
-        facebook_url = FacebookIE._extract_url(webpage)
-        if facebook_url is not None:
-            return self.url_result(facebook_url, 'Facebook')
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        if facebook_urls:
+            return self.playlist_from_matches(facebook_urls, video_id, video_title)
 
         # Look for embedded VK player
         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -2401,12 +2565,12 @@ class GenericIE(InfoExtractor):
         if kaltura_url:
             return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
 
-        # Look for Eagle.Platform embeds
+        # Look for EaglePlatform embeds
         eagleplatform_url = EaglePlatformIE._extract_url(webpage)
         if eagleplatform_url:
-            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
 
-        # Look for ClipYou (uses Eagle.Platform) embeds
+        # Look for ClipYou (uses EaglePlatform) embeds
         mobj = re.search(
             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
         if mobj is not None:
@@ -2514,28 +2678,11 @@ class GenericIE(InfoExtractor):
             return self.playlist_result(
                 limelight_urls, video_id, video_title, video_description)
 
-        mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
-        if mobj:
-            lm = {
-                'Media': 'media',
-                'Channel': 'channel',
-                'ChannelList': 'channel_list',
-            }
-            return self.url_result(smuggle_url('limelight:%s:%s' % (
-                lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
-                'Limelight%s' % mobj.group(1), mobj.group(2))
-
-        mobj = re.search(
-            r'''(?sx)
-                <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
-                    <param[^>]+
-                        name=(["\'])flashVars\2[^>]+
-                        value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
-            ''', webpage)
-        if mobj:
-            return self.url_result(smuggle_url(
-                'limelight:media:%s' % mobj.group('id'),
-                {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+        # Look for Anvato embeds
+        anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+        if anvato_urls:
+            return self.playlist_result(
+                anvato_urls, video_id, video_title, video_description)
 
         # Look for AdobeTVVideo embeds
         mobj = re.search(
@@ -2598,9 +2745,9 @@ class GenericIE(InfoExtractor):
                 self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
 
         # Look for LiveLeak embeds
-        liveleak_url = LiveLeakIE._extract_url(webpage)
-        if liveleak_url:
-            return self.url_result(liveleak_url, 'LiveLeak')
+        liveleak_urls = LiveLeakIE._extract_urls(webpage)
+        if liveleak_urls:
+            return self.playlist_from_matches(liveleak_urls, video_id, video_title)
 
         # Look for 3Q SDN embeds
         threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
@@ -2652,20 +2799,52 @@ class GenericIE(InfoExtractor):
         rutube_urls = RutubeIE._extract_urls(webpage)
         if rutube_urls:
             return self.playlist_from_matches(
-                rutube_urls, ie=RutubeIE.ie_key())
+                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
 
-        # Looking for http://schema.org/VideoObject
-        json_ld = self._search_json_ld(
-            webpage, video_id, default={}, expected_type='VideoObject')
-        if json_ld.get('url'):
-            info_dict.update({
-                'title': video_title or info_dict['title'],
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'age_limit': age_limit
-            })
-            info_dict.update(json_ld)
-            return info_dict
+        # Look for WashingtonPost embeds
+        wapo_urls = WashingtonPostIE._extract_urls(webpage)
+        if wapo_urls:
+            return self.playlist_from_matches(
+                wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+        # Look for Mediaset embeds
+        mediaset_urls = MediasetIE._extract_urls(webpage)
+        if mediaset_urls:
+            return self.playlist_from_matches(
+                mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
+        # Look for JOJ.sk embeds
+        joj_urls = JojIE._extract_urls(webpage)
+        if joj_urls:
+            return self.playlist_from_matches(
+                joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+        # Look for megaphone.fm embeds
+        mpfn_urls = MegaphoneIE._extract_urls(webpage)
+        if mpfn_urls:
+            return self.playlist_from_matches(
+                mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+        # Look for vzaar embeds
+        vzaar_urls = VzaarIE._extract_urls(webpage)
+        if vzaar_urls:
+            return self.playlist_from_matches(
+                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
+        def merge_dicts(dict1, dict2):
+            merged = {}
+            for k, v in dict1.items():
+                if v is not None:
+                    merged[k] = v
+            for k, v in dict2.items():
+                if v is None:
+                    continue
+                if (k not in merged or
+                        (isinstance(v, compat_str) and v and
+                            isinstance(merged[k], compat_str) and
+                            not merged[k])):
+                    merged[k] = v
+            return merged
 
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@@ -2683,9 +2862,13 @@ class GenericIE(InfoExtractor):
         if jwplayer_data:
             info = self._parse_jwplayer_data(
                 jwplayer_data, video_id, require_title=False, base_url=url)
-            if not info.get('title'):
-                info['title'] = video_title
-            return info
+            return merge_dicts(info, info_dict)
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
+            return merge_dicts(json_ld, info_dict)
 
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
index 884700c52b90b53fdc8f581378d28611d7c36f33..45ccc11c10a18033c5d53d0416903425a6ef2385 100644 (file)
@@ -82,7 +82,7 @@ class GfycatIE(InfoExtractor):
             video_url = gfy.get('%sUrl' % format_id)
             if not video_url:
                 continue
-            filesize = gfy.get('%sSize' % format_id)
+            filesize = int_or_none(gfy.get('%sSize' % format_id))
             formats.append({
                 'url': video_url,
                 'format_id': format_id,
index 29b684d35875031c0b5a256e0f12cf0695b90353..6a1b1e96ebf4dc59f7f5e13dc18e3ee08ef1110c 100644 (file)
@@ -5,9 +5,10 @@ import json
 
 from .common import InfoExtractor
 from ..utils import (
-    unescapeHTML,
-    qualities,
+    determine_ext,
     int_or_none,
+    qualities,
+    unescapeHTML,
 )
 
 
@@ -15,7 +16,7 @@ class GiantBombIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
     _TEST = {
         'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
-        'md5': '57badeface303ecf6b98b812de1b9018',
+        'md5': 'c8ea694254a59246a42831155dec57ac',
         'info_dict': {
             'id': '2300-9782',
             'display_id': 'quick-look-destiny-the-dark-below',
@@ -51,11 +52,16 @@ class GiantBombIE(InfoExtractor):
         for format_id, video_url in video['videoStreams'].items():
             if format_id == 'f4m_stream':
                 continue
-            if video_url.endswith('.f4m'):
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
                 f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
                 if f4m_formats:
                     f4m_formats[0]['quality'] = quality(format_id)
                     formats.extend(f4m_formats)
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, display_id, ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
             else:
                 formats.append({
                     'url': video_url,
diff --git a/youtube_dl/extractor/godtv.py b/youtube_dl/extractor/godtv.py
deleted file mode 100644 (file)
index c5d3b4e..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..utils import js_to_json
-
-
-class GodTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?god\.tv(?:/[^/]+)*/(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        'url': 'http://god.tv/jesus-image/video/jesus-conference-2016/randy-needham',
-        'info_dict': {
-            'id': 'lpd3g2MzE6D1g8zFAKz8AGpxWcpu6o_3',
-            'ext': 'mp4',
-            'title': 'Randy Needham',
-            'duration': 3615.08,
-        },
-        'params': {
-            'skip_download': True,
-        }
-    }, {
-        'url': 'http://god.tv/playlist/bible-study',
-        'info_dict': {
-            'id': 'bible-study',
-        },
-        'playlist_mincount': 37,
-    }, {
-        'url': 'http://god.tv/node/15097',
-        'only_matching': True,
-    }, {
-        'url': 'http://god.tv/live/africa',
-        'only_matching': True,
-    }, {
-        'url': 'http://god.tv/liveevents',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        settings = self._parse_json(
-            self._search_regex(
-                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                webpage, 'settings', default='{}'),
-            display_id, transform_source=js_to_json, fatal=False)
-
-        ooyala_id = None
-
-        if settings:
-            playlist = settings.get('playlist')
-            if playlist and isinstance(playlist, list):
-                entries = [
-                    OoyalaIE._build_url_result(video['content_id'])
-                    for video in playlist if video.get('content_id')]
-                if entries:
-                    return self.playlist_result(entries, display_id)
-            ooyala_id = settings.get('ooyala', {}).get('content_id')
-
-        if not ooyala_id:
-            ooyala_id = self._search_regex(
-                r'["\']content_id["\']\s*:\s*(["\'])(?P<id>[\w-]+)\1',
-                webpage, 'ooyala id', group='id')
-
-        return OoyalaIE._build_url_result(ooyala_id)
index 2bfb9904022c6a3830901baa2ee380b6f4f14714..47a068e742bc0b1a8ae92f55da4834ea628005ae 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_str,
     compat_urlparse,
 )
 from ..utils import (
@@ -46,7 +47,7 @@ class GolemIE(InfoExtractor):
                 continue
 
             formats.append({
-                'format_id': e.tag,
+                'format_id': compat_str(e.tag),
                 'url': compat_urlparse.urljoin(self._PREFIX, url),
                 'height': self._int(e.get('height'), 'height'),
                 'width': self._int(e.get('width'), 'width'),
index fec36cbbb7f43d2b8b37370aec270f543e8f257d..3bf462d631d7ca20f8eb1b146b496a7c3a57415f 100644 (file)
@@ -4,26 +4,61 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
     ExtractorError,
     int_or_none,
     lowercase_escape,
+    update_url_query,
 )
 
 
 class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:docs|drive)\.google\.com/
+                                (?:
+                                    (?:uc|open)\?.*?id=|
+                                    file/d/
+                                )|
+                                video\.google\.com/get_player\?.*?docid=
+                            )
+                            (?P<id>[a-zA-Z0-9_-]{28,})
+                    '''
     _TESTS = [{
         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
-        'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
+        'md5': '5c602afbbf2c1db91831f5d82f678554',
         'info_dict': {
             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
             'ext': 'mp4',
             'title': 'Big Buck Bunny.mp4',
             'duration': 45,
         }
+    }, {
+        # video can't be watched anonymously due to view count limit reached,
+        # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
+        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
+        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
+        'info_dict': {
+            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
+            'ext': 'mp4',
+            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
+        }
     }, {
         # video id is longer than 28 characters
         'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+        'info_dict': {
+            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
+            'ext': 'mp4',
+            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
+            'duration': 189,
+        },
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
         'only_matching': True,
     }]
     _FORMATS_EXT = {
@@ -44,6 +79,13 @@ class GoogleDriveIE(InfoExtractor):
         '46': 'webm',
         '59': 'mp4',
     }
+    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
+    _CAPTIONS_ENTRY_TAG = {
+        'subtitles': 'track',
+        'automatic_captions': 'target',
+    }
+    _caption_formats_ext = []
+    _captions_xml = None
 
     @staticmethod
     def _extract_url(webpage):
@@ -53,41 +95,183 @@ class GoogleDriveIE(InfoExtractor):
         if mobj:
             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
 
+    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
+        if self._captions_xml:
+            return
+        self._captions_xml = self._download_xml(
+            self._BASE_URL_CAPTIONS, video_id, query={
+                'id': video_id,
+                'vid': subtitles_id,
+                'hl': hl,
+                'v': video_id,
+                'type': 'list',
+                'tlangs': '1',
+                'fmts': '1',
+                'vssids': '1',
+            }, note='Downloading subtitles XML',
+            errnote='Unable to download subtitles XML', fatal=False)
+        if self._captions_xml:
+            for f in self._captions_xml.findall('format'):
+                if f.attrib.get('fmt_code') and not f.attrib.get('default'):
+                    self._caption_formats_ext.append(f.attrib['fmt_code'])
+
+    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
+                              origin_lang_code=None):
+        if not subtitles_id or not caption_type:
+            return
+        captions = {}
+        for caption_entry in self._captions_xml.findall(
+                self._CAPTIONS_ENTRY_TAG[caption_type]):
+            caption_lang_code = caption_entry.attrib.get('lang_code')
+            if not caption_lang_code:
+                continue
+            caption_format_data = []
+            for caption_format in self._caption_formats_ext:
+                query = {
+                    'vid': subtitles_id,
+                    'v': video_id,
+                    'fmt': caption_format,
+                    'lang': (caption_lang_code if origin_lang_code is None
+                             else origin_lang_code),
+                    'type': 'track',
+                    'name': '',
+                    'kind': '',
+                }
+                if origin_lang_code is not None:
+                    query.update({'tlang': caption_lang_code})
+                caption_format_data.append({
+                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
+                    'ext': caption_format,
+                })
+            captions[caption_lang_code] = caption_format_data
+        return captions
+
+    def _get_subtitles(self, video_id, subtitles_id, hl):
+        if not subtitles_id or not hl:
+            return
+        self._download_subtitles_xml(video_id, subtitles_id, hl)
+        if not self._captions_xml:
+            return
+        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
+
+    def _get_automatic_captions(self, video_id, subtitles_id, hl):
+        if not subtitles_id or not hl:
+            return
+        self._download_subtitles_xml(video_id, subtitles_id, hl)
+        if not self._captions_xml:
+            return
+        track = self._captions_xml.find('track')
+        if track is None:
+            return
+        origin_lang_code = track.attrib.get('lang_code')
+        if not origin_lang_code:
+            return
+        return self._get_captions_by_type(
+            video_id, subtitles_id, 'automatic_captions', origin_lang_code)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(
             'http://docs.google.com/file/d/%s' % video_id, video_id)
 
-        reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
-        if reason:
-            raise ExtractorError(reason)
-
-        title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
+        title = self._search_regex(
+            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
+            default=None) or self._og_search_title(webpage)
         duration = int_or_none(self._search_regex(
-            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None))
-        fmt_stream_map = self._search_regex(
-            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',')
-        fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')
+            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
+            default=None))
 
         formats = []
-        for fmt, fmt_stream in zip(fmt_list, fmt_stream_map):
-            fmt_id, fmt_url = fmt_stream.split('|')
-            resolution = fmt.split('/')[1]
-            width, height = resolution.split('x')
-            formats.append({
-                'url': lowercase_escape(fmt_url),
-                'format_id': fmt_id,
-                'resolution': resolution,
-                'width': int_or_none(width),
-                'height': int_or_none(height),
-                'ext': self._FORMATS_EXT[fmt_id],
+        fmt_stream_map = self._search_regex(
+            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
+            'fmt stream map', default='').split(',')
+        fmt_list = self._search_regex(
+            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
+            'fmt_list', default='').split(',')
+        if fmt_stream_map and fmt_list:
+            resolutions = {}
+            for fmt in fmt_list:
+                mobj = re.search(
+                    r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
+                if mobj:
+                    resolutions[mobj.group('format_id')] = (
+                        int(mobj.group('width')), int(mobj.group('height')))
+
+            for fmt_stream in fmt_stream_map:
+                fmt_stream_split = fmt_stream.split('|')
+                if len(fmt_stream_split) < 2:
+                    continue
+                format_id, format_url = fmt_stream_split[:2]
+                f = {
+                    'url': lowercase_escape(format_url),
+                    'format_id': format_id,
+                    'ext': self._FORMATS_EXT[format_id],
+                }
+                resolution = resolutions.get(format_id)
+                if resolution:
+                    f.update({
+                        'width': resolution[0],
+                        'height': resolution[1],
+                    })
+                formats.append(f)
+
+        source_url = update_url_query(
+            'https://drive.google.com/uc', {
+                'id': video_id,
+                'export': 'download',
             })
+        urlh = self._request_webpage(
+            source_url, video_id, note='Requesting source file',
+            errnote='Unable to request source file', fatal=False)
+        if urlh:
+            def add_source_format(src_url):
+                formats.append({
+                    'url': src_url,
+                    'ext': determine_ext(title, 'mp4').lower(),
+                    'format_id': 'source',
+                    'quality': 1,
+                })
+            if urlh.headers.get('Content-Disposition'):
+                add_source_format(source_url)
+            else:
+                confirmation_webpage = self._webpage_read_content(
+                    urlh, url, video_id, note='Downloading confirmation page',
+                    errnote='Unable to confirm download', fatal=False)
+                if confirmation_webpage:
+                    confirm = self._search_regex(
+                        r'confirm=([^&"\']+)', confirmation_webpage,
+                        'confirmation code', fatal=False)
+                    if confirm:
+                        add_source_format(update_url_query(source_url, {
+                            'confirm': confirm,
+                        }))
+
+        if not formats:
+            reason = self._search_regex(
+                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
+            if reason:
+                raise ExtractorError(reason, expected=True)
+
         self._sort_formats(formats)
 
+        hl = self._search_regex(
+            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
+        subtitles_id = None
+        ttsurl = self._search_regex(
+            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
+        if ttsurl:
+            # the video Id for subtitles will be the last value in the ttsurl
+            # query string
+            subtitles_id = ttsurl.encode('utf-8').decode(
+                'unicode_escape').split('=')[-1]
+
         return {
             'id': video_id,
             'title': title,
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'duration': duration,
             'formats': formats,
+            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
+            'automatic_captions': self.extract_automatic_captions(
+                video_id, subtitles_id, hl),
         }
index e854300c71b6b5c2592f9c9c84b386831578c482..a4f3325650d491d14e3cf817c8a0892d2b53c975 100644 (file)
@@ -7,14 +7,19 @@ from .common import InfoExtractor
 class HGTVComShowIE(InfoExtractor):
     IE_NAME = 'hgtv.com:show'
     _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P<id>[^/?#&]+)'
-    _TEST = {
-        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos',
+    _TESTS = [{
+        # data-module="video"
+        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-season-4-videos',
         'info_dict': {
-            'id': 'flip-or-flop-full-episodes-videos',
+            'id': 'flip-or-flop-full-episodes-season-4-videos',
             'title': 'Flip or Flop Full Episodes',
         },
         'playlist_mincount': 15,
-    }
+    }, {
+        # data-deferred-module="video"
+        'url': 'http://www.hgtv.com/shows/good-bones/episodes/an-old-victorian-house-gets-a-new-facelift',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
@@ -23,7 +28,7 @@ class HGTVComShowIE(InfoExtractor):
 
         config = self._parse_json(
             self._search_regex(
-                r'(?s)data-module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',
+                r'(?s)data-(?:deferred-)?module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',
                 webpage, 'video config'),
             display_id)['channels'][0]
 
index e21ebb8fb4057ac6b6d226b3c0f99501b3340cdf..1d905dc81d925f89a5c8f7a4a23c1e62238d01b5 100644 (file)
@@ -16,8 +16,8 @@ from ..utils import (
 
 class HitboxIE(InfoExtractor):
     IE_NAME = 'hitbox'
-    _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)'
+    _TESTS = [{
         'url': 'http://www.hitbox.tv/video/203213',
         'info_dict': {
             'id': '203213',
@@ -38,13 +38,15 @@ class HitboxIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'https://www.smashcast.tv/hitboxlive/videos/203213',
+        'only_matching': True,
+    }]
 
     def _extract_metadata(self, url, video_id):
         thumb_base = 'https://edge.sf.hitbox.tv'
         metadata = self._download_json(
-            '%s/%s' % (url, video_id), video_id,
-            'Downloading metadata JSON')
+            '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON')
 
         date = 'media_live_since'
         media_type = 'livestream'
@@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor):
         views = int_or_none(video_meta.get('media_views'))
         timestamp = parse_iso8601(video_meta.get(date), ' ')
         categories = [video_meta.get('category_name')]
-        thumbs = [
-            {'url': thumb_base + video_meta.get('media_thumbnail'),
-             'width': 320,
-             'height': 180},
-            {'url': thumb_base + video_meta.get('media_thumbnail_large'),
-             'width': 768,
-             'height': 432},
-        ]
+        thumbs = [{
+            'url': thumb_base + video_meta.get('media_thumbnail'),
+            'width': 320,
+            'height': 180
+        }, {
+            'url': thumb_base + video_meta.get('media_thumbnail_large'),
+            'width': 768,
+            'height': 432
+        }]
 
         return {
             'id': video_id,
@@ -90,7 +93,7 @@ class HitboxIE(InfoExtractor):
         video_id = self._match_id(url)
 
         player_config = self._download_json(
-            'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
+            'https://www.smashcast.tv/api/player/config/video/%s' % video_id,
             video_id, 'Downloading video JSON')
 
         formats = []
@@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor):
         self._sort_formats(formats)
 
         metadata = self._extract_metadata(
-            'https://www.hitbox.tv/api/media/video',
-            video_id)
+            'https://www.smashcast.tv/api/media/video', video_id)
         metadata['formats'] = formats
 
         return metadata
@@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor):
 
 class HitboxLiveIE(HitboxIE):
     IE_NAME = 'hitbox:live'
-    _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)'
+    _TESTS = [{
         'url': 'http://www.hitbox.tv/dimak',
         'info_dict': {
             'id': 'dimak',
@@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE):
             # live
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'https://www.smashcast.tv/dimak',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         player_config = self._download_json(
-            'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
+            'https://www.smashcast.tv/api/player/config/live/%s' % video_id,
             video_id)
 
         formats = []
@@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE):
         self._sort_formats(formats)
 
         metadata = self._extract_metadata(
-            'https://www.hitbox.tv/api/media/live',
-            video_id)
+            'https://www.smashcast.tv/api/media/live', video_id)
         metadata['formats'] = formats
         metadata['is_live'] = True
         metadata['title'] = self._live_title(metadata.get('title'))
index c45c68c1d6ff523ddcbb5144e260bc931fb09873..c1367cf517ce9b39960a13705b4475bf4f3cf8d1 100644 (file)
@@ -89,6 +89,11 @@ class IGNIE(InfoExtractor):
             'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
             'only_matching': True,
         },
+        {
+            # videoId pattern
+            'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
+            'only_matching': True,
+        },
     ]
 
     def _find_video_id(self, webpage):
@@ -98,6 +103,8 @@ class IGNIE(InfoExtractor):
             r'data-video-id="(.+?)"',
             r'<object id="vid_(.+?)"',
             r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+            r'videoId&quot;\s*:\s*&quot;(.+?)&quot;',
+            r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',
         ]
         return self._search_regex(res_id, webpage, 'video id', default=None)
 
index f95c00c7330f3db4b5354161804460cbc2bb53d0..3ff672a89215f249574fac721bf913bee84d8200 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 class ImdbIE(InfoExtractor):
     IE_NAME = 'imdb'
     IE_DESC = 'Internet Movie Database trailers'
-    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://www.imdb.com/video/imdb/vi2524815897',
@@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor):
     }, {
         'url': 'http://www.imdb.com/videoplayer/vi1562949145',
         'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 9fb71e8effe107c6e182c3537cd634b7ac21e9bb..fe425e786479e505aad8082745d43040d38ba881 100644 (file)
@@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):
 
     def _extract_http_audio(self, webpage, video_id):
         fields = self._hidden_inputs(webpage)
-        http_audio_url = fields['filename']
-        if http_audio_url is None:
+        http_audio_url = fields.get('filename')
+        if not http_audio_url:
             return []
 
         cookies_header = {'Cookie': self._extract_cookies(webpage)}
index f3156804d8a072509045563017046dc6d670918e..26c48e4b889545f4e2ffacccfdb859678c4bf2c3 100644 (file)
@@ -59,12 +59,18 @@ class ITVIE(InfoExtractor):
         def _add_sub_element(element, name):
             return etree.SubElement(element, _add_ns(name))
 
+        production_id = (
+            params.get('data-video-autoplay-id') or
+            '%s#001' % (
+                params.get('data-video-episode-id') or
+                video_id.replace('a', '/')))
+
         req_env = etree.Element(_add_ns('soapenv:Envelope'))
         _add_sub_element(req_env, 'soapenv:Header')
         body = _add_sub_element(req_env, 'soapenv:Body')
         get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
         request = _add_sub_element(get_playlist, 'tem:request')
-        _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id']
+        _add_sub_element(request, 'itv:ProductionId').text = production_id
         _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
         vodcrid = _add_sub_element(request, 'itv:Vodcrid')
         _add_sub_element(vodcrid, 'com:Id')
diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py
new file mode 100755 (executable)
index 0000000..a764023
--- /dev/null
@@ -0,0 +1,100 @@
+# coding: utf-8\r
+from __future__ import unicode_literals\r
+\r
+import re\r
+\r
+from .common import InfoExtractor\r
+from ..compat import compat_str\r
+from ..utils import (\r
+    int_or_none,\r
+    js_to_json,\r
+    try_get,\r
+)\r
+\r
+\r
+class JojIE(InfoExtractor):\r
+    _VALID_URL = r'''(?x)\r
+                    (?:\r
+                        joj:|\r
+                        https?://media\.joj\.sk/embed/\r
+                    )\r
+                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\r
+                '''\r
+    _TESTS = [{\r
+        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',\r
+        'info_dict': {\r
+            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',\r
+            'ext': 'mp4',\r
+            'title': 'NOVÉ BÝVANIE',\r
+            'thumbnail': r're:^https?://.*\.jpg$',\r
+            'duration': 3118,\r
+        }\r
+    }, {\r
+        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',\r
+        'only_matching': True,\r
+    }]\r
+\r
+    @staticmethod\r
+    def _extract_urls(webpage):\r
+        return re.findall(\r
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',\r
+            webpage)\r
+\r
+    def _real_extract(self, url):\r
+        video_id = self._match_id(url)\r
+\r
+        webpage = self._download_webpage(\r
+            'https://media.joj.sk/embed/%s' % video_id, video_id)\r
+\r
+        title = self._search_regex(\r
+            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',\r
+             r'<title>(?P<title>[^<]+)'), webpage, 'title',\r
+            default=None, group='title') or self._og_search_title(webpage)\r
+\r
+        bitrates = self._parse_json(\r
+            self._search_regex(\r
+                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',\r
+                default='{}'),\r
+            video_id, transform_source=js_to_json, fatal=False)\r
+\r
+        formats = []\r
+        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:\r
+            if isinstance(format_url, compat_str):\r
+                height = self._search_regex(\r
+                    r'(\d+)[pP]\.', format_url, 'height', default=None)\r
+                formats.append({\r
+                    'url': format_url,\r
+                    'format_id': '%sp' % height if height else None,\r
+                    'height': int(height),\r
+                })\r
+        if not formats:\r
+            playlist = self._download_xml(\r
+                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,\r
+                video_id)\r
+            for file_el in playlist.findall('./files/file'):\r
+                path = file_el.get('path')\r
+                if not path:\r
+                    continue\r
+                format_id = file_el.get('id') or file_el.get('label')\r
+                formats.append({\r
+                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(\r
+                        'dat/', '', 1),\r
+                    'format_id': format_id,\r
+                    'height': int_or_none(self._search_regex(\r
+                        r'(\d+)[pP]', format_id or path, 'height',\r
+                        default=None)),\r
+                })\r
+        self._sort_formats(formats)\r
+\r
+        thumbnail = self._og_search_thumbnail(webpage)\r
+\r
+        duration = int_or_none(self._search_regex(\r
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))\r
+\r
+        return {\r
+            'id': video_id,\r
+            'title': title,\r
+            'thumbnail': thumbnail,\r
+            'duration': duration,\r
+            'formats': formats,\r
+        }\r
index f9a034b78e41a8ec4b998f956c25c47715d6b1c7..27e0e37f6e56a4d972a8757482d7970f4d2ea114 100644 (file)
@@ -65,9 +65,9 @@ class JoveIE(InfoExtractor):
             webpage, 'description', fatal=False)
         publish_date = unified_strdate(self._html_search_meta(
             'citation_publication_date', webpage, 'publish date', fatal=False))
-        comment_count = self._html_search_regex(
+        comment_count = int(self._html_search_regex(
             r'<meta name="num_comments" content="(\d+) Comments?"',
-            webpage, 'comment count', fatal=False)
+            webpage, 'comment count', fatal=False))
 
         return {
             'id': video_id,
index 41c1f3d96be288957cbaba2c21e3477a2969b24d..138d4844d1bbd70e56aff50b5a59ff2ddac8f665 100644 (file)
@@ -324,7 +324,7 @@ class KalturaIE(InfoExtractor):
         if captions:
             for caption in captions.get('objects', []):
                 # Continue if caption is not ready
-                if f.get('status') != 2:
+                if caption.get('status') != 2:
                     continue
                 if not caption.get('id'):
                     continue
index 4e9eb67bf24690571176299de5ff900c7496fec8..f236a2f78e600d45c3f7dbd40de2f320f4cf277e 100644 (file)
@@ -48,7 +48,7 @@ class KarriereVideosIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         title = (self._html_search_meta('title', webpage, default=None) or
-                 self._search_regex(r'<h1 class="title">([^<]+)</h1>'))
+                 self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
 
         video_id = self._search_regex(
             r'/config/video/(.+?)\.xml', webpage, 'video id')
index 3190b187c9dfb8fa9204e9761b47ded0c17f5f2d..c7f813370162bb8582db8abd9a3f49e3e4bb1bc1 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
+
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
@@ -8,15 +10,15 @@ from ..utils import (
     urlencode_postdata,
     xpath_element,
     xpath_text,
-    urljoin,
     update_url_query,
+    js_to_json,
 )
 
 
 class Laola1TvEmbedIE(InfoExtractor):
     IE_NAME = 'laola1tv:embed'
     _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)'
-    _TEST = {
+    _TESTS = [{
         # flashvars.premium = "false";
         'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',
         'info_dict': {
@@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor):
             'uploader': 'ITTF - International Table Tennis Federation',
             'upload_date': '20161211',
         },
-    }
+    }]
+
+    def _extract_token_url(self, stream_access_url, video_id, data):
+        return self._download_json(
+            stream_access_url, video_id, headers={
+                'Content-Type': 'application/json',
+            }, data=json.dumps(data).encode())['data']['stream-access'][0]
+
+    def _extract_formats(self, token_url, video_id):
+        token_doc = self._download_xml(
+            token_url, video_id, 'Downloading token',
+            headers=self.geo_verification_headers())
+
+        token_attrib = xpath_element(token_doc, './/token').attrib
+
+        if token_attrib['status'] != '0':
+            raise ExtractorError(
+                'Token error: %s' % token_attrib['comment'], expected=True)
+
+        formats = self._extract_akamai_formats(
+            '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
+            video_id)
+        self._sort_formats(formats)
+        return formats
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor):
         else:
             data_abo = urlencode_postdata(
                 dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))
-            token_url = self._download_json(
-                'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access',
-                video_id, query={
+            stream_access_url = update_url_query(
+                'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {
                     'videoId': _v('id'),
                     'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),
                     'label': _v('label'),
                     'area': _v('area'),
-                }, data=data_abo)['data']['stream-access'][0]
-
-        token_doc = self._download_xml(
-            token_url, video_id, 'Downloading token',
-            headers=self.geo_verification_headers())
-
-        token_attrib = xpath_element(token_doc, './/token').attrib
+                })
+            token_url = self._extract_token_url(stream_access_url, video_id, data_abo)
 
-        if token_attrib['status'] != '0':
-            raise ExtractorError(
-                'Token error: %s' % token_attrib['comment'], expected=True)
-
-        formats = self._extract_akamai_formats(
-            '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
-            video_id)
-        self._sort_formats(formats)
+        formats = self._extract_formats(token_url, video_id)
 
         categories_str = _v('meta_sports')
         categories = categories_str.split(',') if categories_str else []
@@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor):
         }
 
 
-class Laola1TvIE(InfoExtractor):
+class Laola1TvIE(Laola1TvEmbedIE):
     IE_NAME = 'laola1tv'
     _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'
     _TESTS = [{
@@ -164,13 +176,60 @@ class Laola1TvIE(InfoExtractor):
         if 'Dieser Livestream ist bereits beendet.' in webpage:
             raise ExtractorError('This live stream has already finished.', expected=True)
 
-        iframe_url = urljoin(url, self._search_regex(
-            r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
-            webpage, 'iframe url'))
+        conf = self._parse_json(self._search_regex(
+            r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'),
+            display_id, js_to_json)
+
+        video_id = conf['videoid']
+
+        config = self._download_json(conf['configUrl'], video_id, query={
+            'videoid': video_id,
+            'partnerid': conf['partnerid'],
+            'language': conf.get('language', ''),
+            'portal': conf.get('portalid', ''),
+        })
+        error = config.get('error')
+        if error:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        video_data = config['video']
+        title = video_data['title']
+        is_live = video_data.get('isLivestream') and video_data.get('isLive')
+        meta = video_data.get('metaInformation')
+        sports = meta.get('sports')
+        categories = sports.split(',') if sports else []
+
+        token_url = self._extract_token_url(
+            video_data['streamAccess'], video_id,
+            video_data['abo']['required'])
+
+        formats = self._extract_formats(token_url, video_id)
 
         return {
-            '_type': 'url',
+            'id': video_id,
             'display_id': display_id,
-            'url': iframe_url,
-            'ie_key': 'Laola1TvEmbed',
+            'title': self._live_title(title) if is_live else title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('image'),
+            'categories': categories,
+            'formats': formats,
+            'is_live': is_live,
         }
+
+
+class ITTFIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        return self.url_result(
+            update_url_query('https://www.laola1.tv/titanplayer.php', {
+                'videoid': self._match_id(url),
+                'type': 'V',
+                'lang': 'en',
+                'portal': 'int',
+                'customer': 1024,
+            }), Laola1TvEmbedIE.ie_key())
index 9eda956d25aa32dc6adf6186e2bde15f6cdcca8d..0a07c1320993647dbc844b6d0bdfd4591ef2825d 100644 (file)
@@ -23,7 +23,6 @@ from ..utils import (
     str_or_none,
     url_basename,
     urshift,
-    update_url_query,
 )
 
 
@@ -51,7 +50,7 @@ class LeIE(InfoExtractor):
             'id': '1415246',
             'ext': 'mp4',
             'title': '美人天下01',
-            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+            'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
         },
         'params': {
             'hls_prefer_native': True,
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor):
         'params': {
             'hls_prefer_native': True,
         },
-        'skip': 'Only available in China',
     }, {
         'url': 'http://sports.le.com/video/25737697.html',
         'only_matching': True,
@@ -81,7 +79,7 @@ class LeIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
+    # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf
     def ror(self, param1, param2):
         _loc3_ = 0
         while _loc3_ < param2:
@@ -90,15 +88,8 @@ class LeIE(InfoExtractor):
         return param1
 
     def calc_time_key(self, param1):
-        _loc2_ = 773625421
-        _loc3_ = self.ror(param1, _loc2_ % 13)
-        _loc3_ = _loc3_ ^ _loc2_
-        _loc3_ = self.ror(_loc3_, _loc2_ % 17)
-        return _loc3_
-
-    # reversed from http://jstatic.letvcdn.com/sdk/player.js
-    def get_mms_key(self, time):
-        return self.ror(time, 8) ^ 185025305
+        _loc2_ = 185025305
+        return self.ror(param1, _loc2_ % 17) ^ _loc2_
 
     # see M3U8Encryption class in KLetvPlayer.swf
     @staticmethod
@@ -122,7 +113,7 @@ class LeIE(InfoExtractor):
 
     def _check_errors(self, play_json):
         # Check for errors
-        playstatus = play_json['playstatus']
+        playstatus = play_json['msgs']['playstatus']
         if playstatus['status'] == 0:
             flag = playstatus['flag']
             if flag == 1:
@@ -134,58 +125,31 @@ class LeIE(InfoExtractor):
         media_id = self._match_id(url)
         page = self._download_webpage(url, media_id)
 
-        play_json_h5 = self._download_json(
-            'http://api.le.com/mms/out/video/playJsonH5',
-            media_id, 'Downloading html5 playJson data', query={
-                'id': media_id,
-                'platid': 3,
-                'splatid': 304,
-                'format': 1,
-                'tkey': self.get_mms_key(int(time.time())),
-                'domain': 'www.le.com',
-                'tss': 'no',
-            },
-            headers=self.geo_verification_headers())
-        self._check_errors(play_json_h5)
-
         play_json_flash = self._download_json(
-            'http://api.le.com/mms/out/video/playJson',
+            'http://player-pc.le.com/mms/out/video/playJson',
             media_id, 'Downloading flash playJson data', query={
                 'id': media_id,
                 'platid': 1,
                 'splatid': 101,
                 'format': 1,
+                'source': 1000,
                 'tkey': self.calc_time_key(int(time.time())),
                 'domain': 'www.le.com',
+                'region': 'cn',
             },
             headers=self.geo_verification_headers())
         self._check_errors(play_json_flash)
 
-        def get_h5_urls(media_url, format_id):
-            location = self._download_json(
-                media_url, media_id,
-                'Download JSON metadata for format %s' % format_id, query={
-                    'format': 1,
-                    'expect': 3,
-                    'tss': 'no',
-                })['location']
-
-            return {
-                'http': update_url_query(location, {'tss': 'no'}),
-                'hls': update_url_query(location, {'tss': 'ios'}),
-            }
-
         def get_flash_urls(media_url, format_id):
-            media_url += '&' + compat_urllib_parse_urlencode({
-                'm3v': 1,
-                'format': 1,
-                'expect': 3,
-                'rateid': format_id,
-            })
-
             nodes_data = self._download_json(
                 media_url, media_id,
-                'Download JSON metadata for format %s' % format_id)
+                'Download JSON metadata for format %s' % format_id,
+                query={
+                    'm3v': 1,
+                    'format': 1,
+                    'expect': 3,
+                    'tss': 'ios',
+                })
 
             req = self._request_webpage(
                 nodes_data['nodelist'][0]['location'], media_id,
@@ -199,29 +163,28 @@ class LeIE(InfoExtractor):
 
         extracted_formats = []
         formats = []
-        for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
-            playurl = play_json['playurl']
-            play_domain = playurl['domain'][0]
-
-            for format_id, format_data in playurl.get('dispatch', []).items():
-                if format_id in extracted_formats:
-                    continue
-                extracted_formats.append(format_id)
-
-                media_url = play_domain + format_data[0]
-                for protocol, format_url in get_urls(media_url, format_id).items():
-                    f = {
-                        'url': format_url,
-                        'ext': determine_ext(format_data[1]),
-                        'format_id': '%s-%s' % (protocol, format_id),
-                        'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
-                        'quality': int_or_none(format_id),
-                    }
-
-                    if format_id[-1:] == 'p':
-                        f['height'] = int_or_none(format_id[:-1])
-
-                    formats.append(f)
+        playurl = play_json_flash['msgs']['playurl']
+        play_domain = playurl['domain'][0]
+
+        for format_id, format_data in playurl.get('dispatch', []).items():
+            if format_id in extracted_formats:
+                continue
+            extracted_formats.append(format_id)
+
+            media_url = play_domain + format_data[0]
+            for protocol, format_url in get_flash_urls(media_url, format_id).items():
+                f = {
+                    'url': format_url,
+                    'ext': determine_ext(format_data[1]),
+                    'format_id': '%s-%s' % (protocol, format_id),
+                    'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                    'quality': int_or_none(format_id),
+                }
+
+                if format_id[-1:] == 'p':
+                    f['height'] = int_or_none(format_id[:-1])
+
+                formats.append(f)
         self._sort_formats(formats, ('height', 'quality', 'format_id'))
 
         publish_time = parse_iso8601(self._html_search_regex(
index 0a5a3956c66827a745daa70ed3b33e61ea676d64..ad65b2759d7245d2129e238f703a8e66a6cb333c 100644 (file)
@@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor):
             'Channel': 'channel',
             'ChannelList': 'channel_list',
         }
+
+        def smuggle(url):
+            return smuggle_url(url, {'source_url': source_url})
+
         entries = []
         for kind, video_id in re.findall(
                 r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
                 webpage):
             entries.append(cls.url_result(
-                smuggle_url(
-                    'limelight:%s:%s' % (lm[kind], video_id),
-                    {'source_url': source_url}),
+                smuggle('limelight:%s:%s' % (lm[kind], video_id)),
                 'Limelight%s' % kind, video_id))
         for mobj in re.finditer(
                 # As per [1] class attribute should be exactly equal to
@@ -49,10 +51,15 @@ class LimelightBaseIE(InfoExtractor):
                 ''', webpage):
             kind, video_id = mobj.group('kind'), mobj.group('id')
             entries.append(cls.url_result(
-                smuggle_url(
-                    'limelight:%s:%s' % (kind, video_id),
-                    {'source_url': source_url}),
+                smuggle('limelight:%s:%s' % (kind, video_id)),
                 'Limelight%s' % kind.capitalize(), video_id))
+        # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page)
+        for video_id in re.findall(
+                r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})',
+                webpage):
+            entries.append(cls.url_result(
+                smuggle('limelight:media:%s' % video_id),
+                LimelightMediaIE.ie_key(), video_id))
         return entries
 
     def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
index c7de65353e616dc0d5f2ee1b0128c059d6f4f933..246aac576a2c6b275ed38ef614eb7a569b9e2ffb 100644 (file)
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
@@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor):
     _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'
     _TESTS = [{
         'url': 'http://www.liveleak.com/view?i=757_1364311680',
-        'md5': '50f79e05ba149149c1b4ea961223d5b3',
+        'md5': '0813c2430bea7a46bf13acf3406992f4',
         'info_dict': {
             'id': '757_1364311680',
-            'ext': 'flv',
+            'ext': 'mp4',
             'description': 'extremely bad day for this guy..!',
             'uploader': 'ljfriel2',
             'title': 'Most unlucky car accident',
@@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor):
         }
     }, {
         'url': 'http://www.liveleak.com/view?i=f93_1390833151',
-        'md5': 'b13a29626183c9d33944e6a04f41aafc',
+        'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
         'info_dict': {
             'id': 'f93_1390833151',
             'ext': 'mp4',
@@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$'
         }
     }, {
+        # Prochan embed
         'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
         'md5': '42c6d97d54f1db107958760788c5f48f',
         'info_dict': {
@@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor):
             'uploader': 'CapObveus',
             'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
             'age_limit': 18,
-        }
+        },
+        'skip': 'Video is dead',
     }, {
         # Covers https://github.com/rg3/youtube-dl/pull/5983
+        # Multiple resolutions
         'url': 'http://www.liveleak.com/view?i=801_1409392012',
-        'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+        'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
         'info_dict': {
             'id': '801_1409392012',
             'ext': 'mp4',
@@ -70,15 +72,20 @@ class LiveLeakIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        'url': 'https://www.liveleak.com/view?i=677_1439397581',
+        'info_dict': {
+            'id': '677_1439397581',
+            'title': 'Fuel Depot in China Explosion caught on video',
+        },
+        'playlist_count': 3,
     }]
 
     @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)',
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"',
             webpage)
-        if mobj:
-            return 'http://www.liveleak.com/view?i=%s' % mobj.group('id')
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -93,57 +100,70 @@ class LiveLeakIE(InfoExtractor):
             webpage, 'age limit', default=None))
         video_thumbnail = self._og_search_thumbnail(webpage)
 
-        sources_raw = self._search_regex(
-            r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
-        if sources_raw is None:
-            alt_source = self._search_regex(
-                r'(file: ".*?"),', webpage, 'video URL', default=None)
-            if alt_source:
-                sources_raw = '[{ %s}]' % alt_source
+        entries = self._parse_html5_media_entries(url, webpage, video_id)
+        if not entries:
+            # Maybe an embed?
+            embed_url = self._search_regex(
+                r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
+                webpage, 'embed URL')
+            return {
+                '_type': 'url_transparent',
+                'url': embed_url,
+                'id': video_id,
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+            }
+
+        for idx, info_dict in enumerate(entries):
+            for a_format in info_dict['formats']:
+                if not a_format.get('height'):
+                    a_format['height'] = int_or_none(self._search_regex(
+                        r'([0-9]+)p\.mp4', a_format['url'], 'height label',
+                        default=None))
+
+            self._sort_formats(info_dict['formats'])
+
+            # Don't append entry ID for one-video pages to keep backward compatibility
+            if len(entries) > 1:
+                info_dict['id'] = '%s_%s' % (video_id, idx + 1)
             else:
-                # Maybe an embed?
-                embed_url = self._search_regex(
-                    r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
-                    webpage, 'embed URL')
-                return {
-                    '_type': 'url_transparent',
-                    'url': embed_url,
-                    'id': video_id,
-                    'title': video_title,
-                    'description': video_description,
-                    'uploader': video_uploader,
-                    'age_limit': age_limit,
-                }
-
-        sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
-        sources = json.loads(sources_json)
-
-        formats = [{
-            'format_id': '%s' % i,
-            'format_note': s.get('label'),
-            'url': s['file'],
-        } for i, s in enumerate(sources)]
-
-        for i, s in enumerate(sources):
-            # Removing '.h264_*.mp4' gives the raw video, which is essentially
-            # the same video without the LiveLeak logo at the top (see
-            # https://github.com/rg3/youtube-dl/pull/4768)
-            orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
-            if s['file'] != orig_url:
-                formats.append({
-                    'format_id': 'original-%s' % i,
-                    'format_note': s.get('label'),
-                    'url': orig_url,
-                    'preference': 1,
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader,
-            'formats': formats,
-            'age_limit': age_limit,
-            'thumbnail': video_thumbnail,
-        }
+                info_dict['id'] = video_id
+
+            info_dict.update({
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+                'thumbnail': video_thumbnail,
+            })
+
+        return self.playlist_result(entries, video_id, video_title)
+
+
+class LiveLeakEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)'
+
+    # See generic.py for actual test cases
+    _TESTS = [{
+        'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id = mobj.group('kind', 'id')
+
+        if kind == 'f':
+            webpage = self._download_webpage(url, video_id)
+            liveleak_url = self._search_regex(
+                r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+                webpage, 'LiveLeak URL', group='url')
+        elif kind == 'i':
+            liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id
+
+        return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py
new file mode 100644 (file)
index 0000000..b94b3c2
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class ManyVidsIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/',
+        'md5': '03f11bb21c52dd12a05be21a5c7dcc97',
+        'info_dict': {
+            'id': '133957',
+            'ext': 'mp4',
+            'title': 'everthing about me (Preview)',
+            'view_count': int,
+            'like_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'video URL', group='url')
+
+        title = '%s (Preview)' % self._html_search_regex(
+            r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title')
+
+        like_count = int_or_none(self._search_regex(
+            r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
+        view_count = int_or_none(self._html_search_regex(
+            r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
+            'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'view_count': view_count,
+            'like_count': like_count,
+            'formats': [{
+                'url': video_url,
+            }],
+        }
index 6e067474b78c33948753ba58de2250617838b688..4c32fbc2c27b9a78e0e6fdae73f7dafd865d90fc 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 class MedialaanIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:www\.)?
+                        (?:www\.|nieuws\.)?
                         (?:
                             (?P<site_id>vtm|q2|vtmkzoom)\.be/
                             (?:
@@ -85,6 +85,22 @@ class MedialaanIE(InfoExtractor):
         # clip
         'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
         'only_matching': True,
+    }, {
+        # http/s redirect
+        'url': 'https://vtmkzoom.be/video?aid=45724',
+        'info_dict': {
+            'id': '257136373657000',
+            'ext': 'mp4',
+            'title': 'K3 Dansstudio Ushuaia afl.6',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        # nieuws.vtm.be
+        'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma',
+        'only_matching': True,
     }]
 
     def _real_initialize(self):
@@ -146,6 +162,8 @@ class MedialaanIE(InfoExtractor):
                 video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
             if player:
                 video = player[-1]
+                if video['videoUrl'] in ('http', 'https'):
+                    return self.url_result(video['url'], MedialaanIE.ie_key())
                 info = {
                     'id': video_id,
                     'url': video['videoUrl'],
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
new file mode 100644 (file)
index 0000000..9760eaf
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    parse_duration,
+    try_get,
+    unified_strdate,
+)
+
+
+class MediasetIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        mediaset:|
+                        https?://
+                            (?:www\.)?video\.mediaset\.it/
+                            (?:
+                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+                            )
+                    )(?P<id>[0-9]+)
+                    '''
+    _TESTS = [{
+        # full episode
+        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+        'info_dict': {
+            'id': '661824',
+            'ext': 'mp4',
+            'title': 'Quarta puntata',
+            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1414,
+            'creator': 'mediaset',
+            'upload_date': '20161107',
+            'series': 'Hello Goodbye',
+            'categories': ['reality'],
+        },
+        'expected_warnings': ['is not a supported codec'],
+    }, {
+        # clip
+        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+        'only_matching': True,
+    }, {
+        # iframe simple
+        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+        'only_matching': True,
+    }, {
+        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
+        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+        'only_matching': True,
+    }, {
+        'url': 'mediaset:661824',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_list = self._download_json(
+            'http://cdnsel01.mediaset.net/GetCdn.aspx',
+            video_id, 'Downloading video CDN JSON', query={
+                'streamid': video_id,
+                'format': 'json',
+            })['videoList']
+
+        formats = []
+        for format_url in video_list:
+            if '.ism' in format_url:
+                formats.extend(self._extract_ism_formats(
+                    format_url, video_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': determine_ext(format_url),
+                })
+        self._sort_formats(formats)
+
+        mediainfo = self._download_json(
+            'http://plr.video.mediaset.it/html/metainfo.sjson',
+            video_id, 'Downloading video info JSON', query={
+                'id': video_id,
+            })['video']
+
+        title = mediainfo['title']
+
+        creator = try_get(
+            mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
+        category = try_get(
+            mediainfo, lambda x: x['brand-info']['category'], compat_str)
+        categories = [category] if category else None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': mediainfo.get('short-description'),
+            'thumbnail': mediainfo.get('thumbnail'),
+            'duration': parse_duration(mediainfo.get('duration')),
+            'creator': creator,
+            'upload_date': unified_strdate(mediainfo.get('production-date')),
+            'webpage_url': mediainfo.get('url'),
+            'series': mediainfo.get('brand-value'),
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py
new file mode 100644 (file)
index 0000000..60e3caf
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class MegaphoneIE(InfoExtractor):
+    IE_NAME = 'megaphone.fm'
+    IE_DESC = 'megaphone.fm embedded players'
+    _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
+    _TEST = {
+        'url': 'https://player.megaphone.fm/GLT9749789991?"',
+        'md5': '4816a0de523eb3e972dc0dda2c191f96',
+        'info_dict': {
+            'id': 'GLT9749789991',
+            'ext': 'mp3',
+            'title': '#97 What Kind Of Idiot Gets Phished?',
+            'thumbnail': 're:^https://.*\.png.*$',
+            'duration': 1776.26375,
+            'author': 'Reply All',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_property('audio:title', webpage)
+        author = self._og_search_property('audio:artist', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON')
+        episode_data = self._parse_json(episode_json, video_id, js_to_json)
+        video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:')
+
+        formats = [{
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'thumbnail': thumbnail,
+            'title': title,
+            'author': author,
+            'duration': episode_data['duration'],
+            'formats': formats,
+        }
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return [m[0] for m in re.findall(
+            r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
index 28b743cca1f2355a24cb6b913c7a7410f40d595f..964dc542cc06cc1128fa11693c1d9a4a64271672 100644 (file)
@@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor):
             video_id, 'Downloading gigya script')
 
         # Get a appKey/uuid for getting the session key
-        appKey_var = self._search_regex(
-            r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)',
-            gigya_sc, 'appKey variable')
         appKey = self._search_regex(
-            r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey')
+            r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
+            gigya_sc, 'appKey')
 
         session_json = self._download_json(
             'https://appgrid-api.cloud.accedo.tv/session',
index 0efbe660a5d2a88b41e198078da84baf8bfc7a9e..f6360cce6767c7281a661f6e1cf18bbfe8924fe9 100644 (file)
@@ -9,6 +9,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_chr,
     compat_ord,
+    compat_str,
     compat_urllib_parse_unquote,
     compat_urlparse,
 )
@@ -53,16 +54,27 @@ class MixcloudIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
-    @staticmethod
-    def _decrypt_play_info(play_info):
-        KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
+    _keys = [
+        'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
+        'pleasedontdownloadourmusictheartistswontgetpaid',
+        'window.addEventListener = window.addEventListener || function() {};',
+        '(function() { return new Date().toLocaleDateString(); })()'
+    ]
+    _current_key = None
 
+    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
+    def _decrypt_play_info(self, play_info, video_id):
         play_info = base64.b64decode(play_info.encode('ascii'))
-
-        return ''.join([
-            compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
-            for idx, ch in enumerate(play_info)])
+        for num, key in enumerate(self._keys, start=1):
+            try:
+                return self._parse_json(
+                    ''.join([
+                        compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
+                        for idx, ch in enumerate(play_info)]),
+                    video_id)
+            except ExtractorError:
+                if num == len(self._keys):
+                    raise
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -72,14 +84,30 @@ class MixcloudIE(InfoExtractor):
 
         webpage = self._download_webpage(url, track_id)
 
+        if not self._current_key:
+            js_url = self._search_regex(
+                r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
+                webpage, 'js url', default=None)
+            if js_url:
+                js = self._download_webpage(js_url, track_id, fatal=False)
+                if js:
+                    KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
+                    for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
+                        key = self._search_regex(
+                            KEY_RE_TEMPLATE % key_name, js, 'key',
+                            default=None, group='key')
+                        if key and isinstance(key, compat_str):
+                            self._keys.insert(0, key)
+                            self._current_key = key
+
         message = self._html_search_regex(
             r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
             webpage, 'error message', default=None)
 
         encrypted_play_info = self._search_regex(
             r'm-play-info="([^"]+)"', webpage, 'play info')
-        play_info = self._parse_json(
-            self._decrypt_play_info(encrypted_play_info), track_id)
+
+        play_info = self._decrypt_play_info(encrypted_play_info, track_id)
 
         if message and 'stream_url' not in play_info:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
index 59cd4b8389f28a72f9d16df70edfa64a7ce2ba40..675ff687374a9a94928f3a899ffdb4a45b1b743c 100644 (file)
@@ -15,7 +15,7 @@ class MLBIE(InfoExtractor):
                         (?:[\da-z_-]+\.)*mlb\.com/
                         (?:
                             (?:
-                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
                                 (?:
                                     shared/video/embed/(?:embed|m-internal-embed)\.html|
                                     (?:[^/]+/)+(?:play|index)\.jsp|
@@ -84,7 +84,7 @@ class MLBIE(InfoExtractor):
         },
         {
             'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+            'md5': 'aafaf5b0186fee8f32f20508092f8111',
             'info_dict': {
                 'id': '75609783',
                 'ext': 'mp4',
@@ -94,6 +94,10 @@ class MLBIE(InfoExtractor):
                 'upload_date': '20150415',
             }
         },
+        {
+            'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+            'only_matching': True,
+        },
         {
             'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
             'only_matching': True,
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
deleted file mode 100644 (file)
index 5a1bee5..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class MporaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)'
-    IE_NAME = 'MPORA'
-
-    _TEST = {
-        'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de',
-        'md5': 'a7a228473eedd3be741397cf452932eb',
-        'info_dict': {
-            'id': 'AAdo8okx4wiz',
-            'ext': 'mp4',
-            'title': 'Katy Curd -  Winter in the Forest',
-            'duration': 416,
-            'uploader': 'Peter Newman Media',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        data_json = self._search_regex(
-            [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;",
-             r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"],
-            webpage, 'json')
-        data = self._parse_json(data_json, video_id)
-
-        uploader = data['info_overlay'].get('username')
-        duration = data['video']['duration'] // 1000
-        thumbnail = data['video']['encodings']['sd']['poster']
-        title = data['info_overlay']['title']
-
-        formats = []
-        for encoding_id, edata in data['video']['encodings'].items():
-            for src in edata['sources']:
-                width_str = self._search_regex(
-                    r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'],
-                    False, default=None)
-                vcodec = src['type'].partition('/')[2]
-
-                formats.append({
-                    'format_id': encoding_id + '-' + vcodec,
-                    'url': src['src'],
-                    'vcodec': vcodec,
-                    'width': int_or_none(width_str),
-                })
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'uploader': uploader,
-            'duration': duration,
-            'thumbnail': thumbnail,
-        }
index 1473bcf4845d4b8470393686a7bd9baf1df0e398..650731fdc3617fd8b9680ee97e40b8dfefc7c4e4 100644 (file)
@@ -68,10 +68,6 @@ class MSNIE(InfoExtractor):
             format_url = file_.get('url')
             if not format_url:
                 continue
-            ext = determine_ext(format_url)
-            if ext == 'ism':
-                formats.extend(self._extract_ism_formats(
-                    format_url + '/Manifest', display_id, 'mss', fatal=False))
             if 'm3u8' in format_url:
                 # m3u8_native should not be used here until
                 # https://github.com/rg3/youtube-dl/issues/9913 is fixed
@@ -79,6 +75,9 @@ class MSNIE(InfoExtractor):
                     format_url, display_id, 'mp4',
                     m3u8_id='hls', fatal=False)
                 formats.extend(m3u8_formats)
+            elif determine_ext(format_url) == 'ism':
+                formats.extend(self._extract_ism_formats(
+                    format_url + '/Manifest', display_id, 'mss', fatal=False))
             else:
                 formats.append({
                     'url': format_url,
index 8acea1461a662dc40840526c4efabcbe7a7c29b0..25af5ddfda4765132fec413caca9a09fc2ba2bb9 100644 (file)
@@ -50,8 +50,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
         thumb_node = itemdoc.find(search_path)
         if thumb_node is None:
             return None
-        else:
-            return thumb_node.attrib['url']
+        return thumb_node.get('url') or thumb_node.text or None
 
     def _extract_mobile_video_formats(self, mtvn_id):
         webpage_url = self._MOBILE_TEMPLATE % mtvn_id
@@ -83,7 +82,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 hls_url = rendition.find('./src').text
                 formats.extend(self._extract_m3u8_formats(
                     hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls'))
+                    m3u8_id='hls', fatal=False))
             else:
                 # fms
                 try:
@@ -106,7 +105,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
                     }])
                 except (KeyError, TypeError):
                     raise ExtractorError('Invalid rendition field.')
-        self._sort_formats(formats)
+        if formats:
+            self._sort_formats(formats)
         return formats
 
     def _extract_subtitles(self, mdoc, mtvn_id):
@@ -133,8 +133,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
             mediagen_url += 'acceptMethods='
             mediagen_url += 'hls' if use_hls else 'fms'
 
-        mediagen_doc = self._download_xml(mediagen_url, video_id,
-                                          'Downloading video urls')
+        mediagen_doc = self._download_xml(
+            mediagen_url, video_id, 'Downloading video urls', fatal=False)
+
+        if mediagen_doc is False:
+            return None
 
         item = mediagen_doc.find('./video/item')
         if item is not None and item.get('type') == 'text':
@@ -174,6 +177,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
 
         formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
 
+        # Some parts of complete video may be missing (e.g. missing Act 3 in
+        # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
+        if not formats:
+            return None
+
+        self._sort_formats(formats)
+
         return {
             'title': title,
             'formats': formats,
@@ -205,9 +215,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
         title = xpath_text(idoc, './channel/title')
         description = xpath_text(idoc, './channel/description')
 
+        entries = []
+        for item in idoc.findall('.//item'):
+            info = self._get_video_info(item, use_hls)
+            if info:
+                entries.append(info)
+
         return self.playlist_result(
-            [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')],
-            playlist_title=title, playlist_description=description)
+            entries, playlist_title=title, playlist_description=description)
 
     def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
         triforce_feed = self._parse_json(self._search_regex(
index f281238c93cd0f184b5a5213e67fb16ac82bc8c7..e164d5940044fc57bd9d760927b8f83c638fa198 100644 (file)
@@ -12,64 +12,62 @@ from ..utils import (
 
 
 class MySpaceIE(InfoExtractor):
-    _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        myspace\.com/[^/]+/
+                        (?P<mediatype>
+                            video/[^/]+/(?P<video_id>\d+)|
+                            music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$)
+                        )
+                    '''
 
-    _TESTS = [
-        {
-            'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
-            'md5': '9c1483c106f4a695c47d2911feed50a7',
-            'info_dict': {
-                'id': '109594919',
-                'ext': 'mp4',
-                'title': 'Little Big Town',
-                'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
-                'uploader': 'Five Minutes to the Stage',
-                'uploader_id': 'fiveminutestothestage',
-                'timestamp': 1414108751,
-                'upload_date': '20141023',
-            },
+    _TESTS = [{
+        'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
+        'md5': '9c1483c106f4a695c47d2911feed50a7',
+        'info_dict': {
+            'id': '109594919',
+            'ext': 'mp4',
+            'title': 'Little Big Town',
+            'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+            'uploader': 'Five Minutes to the Stage',
+            'uploader_id': 'fiveminutestothestage',
+            'timestamp': 1414108751,
+            'upload_date': '20141023',
         },
+    }, {
         # songs
-        {
-            'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
-            'md5': '1d7ee4604a3da226dd69a123f748b262',
-            'info_dict': {
-                'id': '93388656',
-                'ext': 'm4a',
-                'title': 'Of weakened soul...',
-                'uploader': 'Killsorrow',
-                'uploader_id': 'killsorrow',
-            },
-        }, {
-            'add_ie': ['Youtube'],
-            'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
-            'info_dict': {
-                'id': 'xqds0B_meys',
-                'ext': 'webm',
-                'title': 'Three Days Grace - Animal I Have Become',
-                'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
-                'uploader': 'ThreeDaysGraceVEVO',
-                'uploader_id': 'ThreeDaysGraceVEVO',
-                'upload_date': '20091002',
-            },
-        }, {
-            'add_ie': ['Youtube'],
-            'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
-            'info_dict': {
-                'id': 'ypWvQgnJrSU',
-                'ext': 'mp4',
-                'title': 'Starset - First Light',
-                'description': 'md5:2d5db6c9d11d527683bcda818d332414',
-                'uploader': 'Yumi K',
-                'uploader_id': 'SorenPromotions',
-                'upload_date': '20140725',
-            }
+        'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
+        'md5': '1d7ee4604a3da226dd69a123f748b262',
+        'info_dict': {
+            'id': '93388656',
+            'ext': 'm4a',
+            'title': 'Of weakened soul...',
+            'uploader': 'Killsorrow',
+            'uploader_id': 'killsorrow',
         },
-    ]
+    }, {
+        'add_ie': ['Youtube'],
+        'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+        'info_dict': {
+            'id': 'xqds0B_meys',
+            'ext': 'webm',
+            'title': 'Three Days Grace - Animal I Have Become',
+            'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
+            'uploader': 'ThreeDaysGraceVEVO',
+            'uploader_id': 'ThreeDaysGraceVEVO',
+            'upload_date': '20091002',
+        },
+    }, {
+        'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+        'only_matching': True,
+    }, {
+        'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.group('video_id') or mobj.group('song_id')
         is_song = mobj.group('mediatype').startswith('music/song')
         webpage = self._download_webpage(url, video_id)
         player_url = self._search_regex(
index d2a44d05dffbf7d42298a260ba728f53e43d5efd..62db70b438d365eb60609beaa6bbad9e2d9fc7d4 100644 (file)
@@ -5,10 +5,8 @@ import re
 from .common import InfoExtractor
 from .theplatform import ThePlatformIE
 from .adobepass import AdobePassIE
-from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     find_xpath_attr,
-    lowercase_escape,
     smuggle_url,
     unescapeHTML,
     update_url_query,
@@ -17,7 +15,7 @@ from ..utils import (
 
 
 class NBCIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+    _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))'
 
     _TESTS = [
         {
@@ -36,16 +34,6 @@ class NBCIE(AdobePassIE):
                 'skip_download': True,
             },
         },
-        {
-            'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
-            'info_dict': {
-                'id': '176',
-                'ext': 'flv',
-                'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
-                'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
-            },
-            'skip': '404 Not Found',
-        },
         {
             'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
             'info_dict': {
@@ -63,11 +51,6 @@ class NBCIE(AdobePassIE):
             },
             'skip': 'Only works from US',
         },
-        {
-            # This video has expired but with an escaped embedURL
-            'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
-            'only_matching': True,
-        },
         {
             # HLS streams requires the 'hdnea3' cookie
             'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
@@ -88,59 +71,38 @@ class NBCIE(AdobePassIE):
     ]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        info = {
+        permalink, video_id = re.match(self._VALID_URL, url).groups()
+        video_data = self._download_json(
+            'https://api.nbc.com/v3/videos', video_id, query={
+                'filter[permalink]': permalink,
+            })['data'][0]['attributes']
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        video_id = video_data['guid']
+        title = video_data['title']
+        if video_data.get('entitlement') == 'auth':
+            resource = self._get_mvpd_resource(
+                'nbcentertainment', title, video_id,
+                video_data.get('vChipRating'))
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, 'nbcentertainment', resource)
+        theplatform_url = smuggle_url(update_url_query(
+            'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
+            query), {'force_smil_url': True})
+        return {
             '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
             'id': video_id,
+            'title': title,
+            'url': theplatform_url,
+            'description': video_data.get('description'),
+            'keywords': video_data.get('keywords'),
+            'season_number': int_or_none(video_data.get('seasonNumber')),
+            'episode_number': int_or_none(video_data.get('episodeNumber')),
+            'series': video_data.get('showName'),
+            'ie_key': 'ThePlatform',
         }
-        video_data = None
-        preload = self._search_regex(
-            r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
-        if preload:
-            preload_data = self._parse_json(preload, video_id)
-            path = compat_urllib_parse_urlparse(url).path.rstrip('/')
-            entity_id = preload_data.get('xref', {}).get(path)
-            video_data = preload_data.get('entities', {}).get(entity_id)
-        if video_data:
-            query = {
-                'mbr': 'true',
-                'manifest': 'm3u',
-            }
-            video_id = video_data['guid']
-            title = video_data['title']
-            if video_data.get('entitlement') == 'auth':
-                resource = self._get_mvpd_resource(
-                    'nbcentertainment', title, video_id,
-                    video_data.get('vChipRating'))
-                query['auth'] = self._extract_mvpd_auth(
-                    url, video_id, 'nbcentertainment', resource)
-            theplatform_url = smuggle_url(update_url_query(
-                'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
-                query), {'force_smil_url': True})
-            info.update({
-                'id': video_id,
-                'title': title,
-                'url': theplatform_url,
-                'description': video_data.get('description'),
-                'keywords': video_data.get('keywords'),
-                'season_number': int_or_none(video_data.get('seasonNumber')),
-                'episode_number': int_or_none(video_data.get('episodeNumber')),
-                'series': video_data.get('showName'),
-            })
-        else:
-            theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
-                [
-                    r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
-                    r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
-                    r'"embedURL"\s*:\s*"([^"]+)"'
-                ],
-                webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
-            if theplatform_url.startswith('//'):
-                theplatform_url = 'http:' + theplatform_url
-            info['url'] = smuggle_url(theplatform_url, {'source_url': url})
-        return info
 
 
 class NBCSportsVPlayerIE(InfoExtractor):
index 9bea610c88a4ac48cc1b84ed5c3fae45789d643e..0e26f8399dd8ea8777c28d0bb61483e27f954965 100644 (file)
@@ -1,6 +1,15 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    parse_duration,
+    parse_filesize,
+    unified_timestamp,
+)
 
 
 class NewgroundsIE(InfoExtractor):
@@ -13,7 +22,10 @@ class NewgroundsIE(InfoExtractor):
             'ext': 'mp3',
             'title': 'B7 - BusMode',
             'uploader': 'Burn7',
-        }
+            'timestamp': 1378878540,
+            'upload_date': '20130911',
+            'duration': 143,
+        },
     }, {
         'url': 'https://www.newgrounds.com/portal/view/673111',
         'md5': '3394735822aab2478c31b1004fe5e5bc',
@@ -22,25 +34,133 @@ class NewgroundsIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Dancin',
             'uploader': 'Squirrelman82',
+            'timestamp': 1460256780,
+            'upload_date': '20160410',
+        },
+    }, {
+        # source format unavailable, additional mp4 formats
+        'url': 'http://www.newgrounds.com/portal/view/689400',
+        'info_dict': {
+            'id': '689400',
+            'ext': 'mp4',
+            'title': 'ZTV News Episode 8',
+            'uploader': 'BennettTheSage',
+            'timestamp': 1487965140,
+            'upload_date': '20170224',
+        },
+        'params': {
+            'skip_download': True,
         },
     }]
 
     def _real_extract(self, url):
         media_id = self._match_id(url)
+
         webpage = self._download_webpage(url, media_id)
 
         title = self._html_search_regex(
             r'<title>([^>]+)</title>', webpage, 'title')
 
-        uploader = self._html_search_regex(
-            r'Author\s*<a[^>]+>([^<]+)', webpage, 'uploader', fatal=False)
+        media_url = self._parse_json(self._search_regex(
+            r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
+
+        formats = [{
+            'url': media_url,
+            'format_id': 'source',
+            'quality': 1,
+        }]
+
+        max_resolution = int_or_none(self._search_regex(
+            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
+            default=None))
+        if max_resolution:
+            url_base = media_url.rpartition('.')[0]
+            for resolution in (360, 720, 1080):
+                if resolution > max_resolution:
+                    break
+                formats.append({
+                    'url': '%s.%dp.mp4' % (url_base, resolution),
+                    'format_id': '%dp' % resolution,
+                    'height': resolution,
+                })
+
+        self._check_formats(formats, media_id)
+        self._sort_formats(formats)
 
-        music_url = self._parse_json(self._search_regex(
-            r'"url":("[^"]+"),', webpage, ''), media_id)
+        uploader = self._search_regex(
+            r'(?:Author|Writer)\s*<a[^>]+>([^<]+)', webpage, 'uploader',
+            fatal=False)
+
+        timestamp = unified_timestamp(self._search_regex(
+            r'<dt>Uploaded</dt>\s*<dd>([^<]+)', webpage, 'timestamp',
+            default=None))
+        duration = parse_duration(self._search_regex(
+            r'<dd>Song\s*</dd><dd>.+?</dd><dd>([^<]+)', webpage, 'duration',
+            default=None))
+
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<dd>Song\s*</dd><dd>(.+?)</dd>', webpage, 'filesize',
+            default=None))
+        if len(formats) == 1:
+            formats[0]['filesize_approx'] = filesize_approx
+
+        if '<dd>Song' in webpage:
+            formats[0]['vcodec'] = 'none'
 
         return {
             'id': media_id,
             'title': title,
-            'url': music_url,
             'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
         }
+
+
+class NewgroundsPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.newgrounds.com/collection/cats',
+        'info_dict': {
+            'id': 'cats',
+            'title': 'Cats',
+        },
+        'playlist_mincount': 46,
+    }, {
+        'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
+        'info_dict': {
+            'id': 'ZONE-SAMA',
+            'title': 'Portal Search: ZONE-SAMA',
+        },
+        'playlist_mincount': 47,
+    }, {
+        'url': 'http://www.newgrounds.com/audio/search/title/cats',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        title = self._search_regex(
+            r'<title>([^>]+)</title>', webpage, 'title', default=None)
+
+        # cut left menu
+        webpage = self._search_regex(
+            r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
+            webpage, 'wide column', default=webpage)
+
+        entries = []
+        for a, path, media_id in re.findall(
+                r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
+                webpage):
+            a_class = extract_attributes(a).get('class')
+            if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
+                continue
+            entries.append(
+                self.url_result(
+                    'https://www.newgrounds.com/%s' % path,
+                    ie=NewgroundsIE.ie_key(), video_id=media_id))
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py
new file mode 100644 (file)
index 0000000..d0235fd
--- /dev/null
@@ -0,0 +1,271 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class NexxIE(InfoExtractor):
+    _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/(?P<id>\d+)'
+    _TESTS = [{
+        # movie
+        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '128907',
+            'ext': 'mp4',
+            'title': 'Stiftung Warentest',
+            'alt_title': 'Wie ein Test abläuft',
+            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
+            'release_year': 2013,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2509,
+            'timestamp': 1384264416,
+            'upload_date': '20131112',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }, {
+        # episode
+        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
+        'info_dict': {
+            'id': '247858',
+            'ext': 'mp4',
+            'title': 'Return of the Golden Child (OV)',
+            'description': 'md5:5d969537509a92b733de21bae249dc63',
+            'release_year': 2017,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1397,
+            'timestamp': 1495033267,
+            'upload_date': '20170517',
+            'episode_number': 2,
+            'season_number': 2,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        entries = []
+
+        # JavaScript Integration
+        mobj = re.search(
+            r'<script\b[^>]+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
+            webpage)
+        if mobj:
+            domain_id = mobj.group('id')
+            for video_id in re.findall(
+                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
+                    webpage):
+                entries.append(
+                    'https://api.nexx.cloud/v3/%s/videos/byid/%s'
+                    % (domain_id, video_id))
+
+        # TODO: support more embed formats
+
+        return entries
+
+    @staticmethod
+    def _extract_url(webpage):
+        return NexxIE._extract_urls(webpage)[0]
+
+    def _handle_error(self, response):
+        status = int_or_none(try_get(
+            response, lambda x: x['metadata']['status']) or 200)
+        if 200 <= status < 300:
+            return
+        raise ExtractorError(
+            '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']),
+            expected=True)
+
+    def _call_api(self, domain_id, path, video_id, data=None, headers={}):
+        headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
+        result = self._download_json(
+            'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id,
+            'Downloading %s JSON' % path, data=urlencode_postdata(data),
+            headers=headers)
+        self._handle_error(result)
+        return result['result']
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        domain_id, video_id = mobj.group('domain_id', 'id')
+
+        # Reverse engineered from JS code (see getDeviceID function)
+        device_id = '%d:%d:%d%d' % (
+            random.randint(1, 4), int(time.time()),
+            random.randint(1e4, 99999), random.randint(1, 9))
+
+        result = self._call_api(domain_id, 'session/init', video_id, data={
+            'nxp_devh': device_id,
+            'nxp_userh': '',
+            'precid': '0',
+            'playlicense': '0',
+            'screenx': '1920',
+            'screeny': '1080',
+            'playerversion': '6.0.00',
+            'gateway': 'html5',
+            'adGateway': '',
+            'explicitlanguage': 'en-US',
+            'addTextTemplates': '1',
+            'addDomainData': '1',
+            'addAdModel': '1',
+        }, headers={
+            'X-Request-Enable-Auth-Fallback': '1',
+        })
+
+        cid = result['general']['cid']
+
+        # As described in [1] X-Request-Token generation algorithm is
+        # as follows:
+        #   md5( operation + domain_id + domain_secret )
+        # where domain_secret is a static value that will be given by nexx.tv
+        # as per [1]. Here is how this "secret" is generated (reversed
+        # from _play.api.init function, search for clienttoken). So it's
+        # actually not static and not that much of a secret.
+        # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
+        secret = result['device']['clienttoken'][int(device_id[0]):]
+        secret = secret[0:len(secret) - int(device_id[-1])]
+
+        op = 'byid'
+
+        # Reversed from JS code for _play.api.call function (search for
+        # X-Request-Token)
+        request_token = hashlib.md5(
+            ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
+
+        video = self._call_api(
+            domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
+                'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+                'addInteractionOptions': '1',
+                'addStatusDetails': '1',
+                'addStreamDetails': '1',
+                'addCaptions': '1',
+                'addScenes': '1',
+                'addHotSpots': '1',
+                'addBumpers': '1',
+                'captionFormat': 'data',
+            }, headers={
+                'X-Request-CID': cid,
+                'X-Request-Token': request_token,
+            })
+
+        general = video['general']
+        title = general['title']
+
+        stream_data = video['streamdata']
+        language = general.get('language_raw') or ''
+
+        # TODO: reverse more cdns and formats
+
+        cdn = stream_data['cdnType']
+        assert cdn == 'azure'
+
+        azure_locator = stream_data['azureLocator']
+
+        AZURE_URL = 'http://nx-p%02d.akamaized.net/'
+
+        for secure in ('s', ''):
+            cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper())
+            if cdn_shield:
+                azure_base = 'http%s://%s' % (secure, cdn_shield)
+                break
+        else:
+            azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', ''))
+
+        is_ml = ',' in language
+        azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % (
+            azure_base, azure_locator, video_id, ('_manifest' if is_ml else ''))
+
+        protection_token = try_get(
+            video, lambda x: x['protectiondata']['token'], compat_str)
+        if protection_token:
+            azure_m3u8_url += '?hdnts=%s' % protection_token
+
+        formats = self._extract_m3u8_formats(
+            azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='%s-hls' % cdn)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': general.get('subtitle'),
+            'description': general.get('description'),
+            'release_year': int_or_none(general.get('year')),
+            'creator': general.get('studio') or general.get('studio_adref'),
+            'thumbnail': try_get(
+                video, lambda x: x['imagedata']['thumb'], compat_str),
+            'duration': parse_duration(general.get('runtime')),
+            'timestamp': int_or_none(general.get('uploaded')),
+            'episode_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['episode'])),
+            'season_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['season'])),
+            'formats': formats,
+        }
+
+
+class NexxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        # iFrame Embed Integration
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        embed_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, embed_id)
+
+        return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key())
index 08a75929e1e049249759f94e6179cfa8932ba87c..510b1c41fd42dd3f77214fd804e9962a78e075af 100644 (file)
@@ -12,6 +12,7 @@ class NickIE(MTVServicesInfoExtractor):
     IE_NAME = 'nick.com'
     _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+    _GEO_COUNTRIES = ['US']
     _TESTS = [{
         'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
         'playlist': [
@@ -74,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor):
 
 class NickDeIE(MTVServicesInfoExtractor):
     IE_NAME = 'nick.de'
-    _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
         'only_matching': True,
@@ -87,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor):
     }, {
         'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht',
         'only_matching': True,
+    }, {
+        'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom',
+        'only_matching': True,
     }]
 
     def _extract_mrss_url(self, webpage, host):
@@ -124,3 +128,21 @@ class NickNightIE(NickDeIE):
         return self._search_regex(
             r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage,
             'mrss url', group='url')
+
+
+class NickRuIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nickelodeonru'
+    _VALID_URL = r'https?://(?:www\.)nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        mgid = self._extract_mgid(webpage)
+        return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
index 8baac23e4b16643a59e6a83570862b6a5afd2b45..026329d3ea4210e2d17af374b67c4a87af252ee1 100644 (file)
@@ -1,23 +1,27 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
 import json
 import datetime
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_parse_qs,
     compat_urlparse,
 )
 from ..utils import (
+    determine_ext,
+    dict_get,
     ExtractorError,
     int_or_none,
+    float_or_none,
     parse_duration,
     parse_iso8601,
-    sanitized_Request,
-    xpath_text,
-    determine_ext,
+    remove_start,
+    try_get,
+    unified_timestamp,
     urlencode_postdata,
+    xpath_text,
 )
 
 
@@ -32,12 +36,15 @@ class NiconicoIE(InfoExtractor):
             'id': 'sm22312215',
             'ext': 'mp4',
             'title': 'Big Buck Bunny',
+            'thumbnail': r're:https?://.*',
             'uploader': 'takuya0301',
             'uploader_id': '2698420',
             'upload_date': '20131123',
             'timestamp': 1385182762,
             'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
             'duration': 33,
+            'view_count': int,
+            'comment_count': int,
         },
         'skip': 'Requires an account',
     }, {
@@ -49,6 +56,7 @@ class NiconicoIE(InfoExtractor):
             'ext': 'swf',
             'title': '【鏡音リン】Dance on media【オリジナル】take2!',
             'description': 'md5:689f066d74610b3b22e0f1739add0f58',
+            'thumbnail': r're:https?://.*',
             'uploader': 'りょうた',
             'uploader_id': '18822557',
             'upload_date': '20110429',
@@ -65,9 +73,11 @@ class NiconicoIE(InfoExtractor):
             'ext': 'unknown_video',
             'description': 'deleted',
             'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+            'thumbnail': r're:https?://.*',
             'upload_date': '20071224',
             'timestamp': int,  # timestamp field has different value if logged in
             'duration': 304,
+            'view_count': int,
         },
         'skip': 'Requires an account',
     }, {
@@ -77,15 +87,57 @@ class NiconicoIE(InfoExtractor):
             'ext': 'mp4',
             'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
             'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+            'thumbnail': r're:https?://.*',
             'timestamp': 1388851200,
             'upload_date': '20140104',
             'uploader': 'アニメロチャンネル',
             'uploader_id': '312',
         },
         'skip': 'The viewing period of the video you were searching for has expired.',
+    }, {
+        # video not available via `getflv`; "old" HTML5 video
+        'url': 'http://www.nicovideo.jp/watch/sm1151009',
+        'md5': '8fa81c364eb619d4085354eab075598a',
+        'info_dict': {
+            'id': 'sm1151009',
+            'ext': 'mp4',
+            'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
+            'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
+            'thumbnail': r're:https?://.*',
+            'duration': 184,
+            'timestamp': 1190868283,
+            'upload_date': '20070927',
+            'uploader': 'denden2',
+            'uploader_id': '1392194',
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        # "New" HTML5 video
+        'url': 'http://www.nicovideo.jp/watch/sm31464864',
+        'md5': '351647b4917660986dc0fa8864085135',
+        'info_dict': {
+            'id': 'sm31464864',
+            'ext': 'mp4',
+            'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
+            'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
+            'timestamp': 1498514060,
+            'upload_date': '20170626',
+            'uploader': 'ゲス',
+            'uploader_id': '40826363',
+            'thumbnail': r're:https?://.*',
+            'duration': 198,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
+        'only_matching': True,
     }]
 
-    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
     _NETRC_MACHINE = 'niconico'
 
     def _real_initialize(self):
@@ -98,19 +150,102 @@ class NiconicoIE(InfoExtractor):
             return True
 
         # Log in
+        login_ok = True
         login_form_strs = {
-            'mail': username,
+            'mail_tel': username,
             'password': password,
         }
-        login_data = urlencode_postdata(login_form_strs)
-        request = sanitized_Request(
-            'https://secure.nicovideo.jp/secure/login', login_data)
-        login_results = self._download_webpage(
-            request, None, note='Logging in', errnote='Unable to log in')
-        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+        urlh = self._request_webpage(
+            'https://account.nicovideo.jp/api/v1/login', None,
+            note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata(login_form_strs))
+        if urlh is False:
+            login_ok = False
+        else:
+            parts = compat_urlparse.urlparse(urlh.geturl())
+            if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
+                login_ok = False
+        if not login_ok:
             self._downloader.report_warning('unable to log in: bad username or password')
-            return False
-        return True
+        return login_ok
+
+    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+        def yesno(boolean):
+            return 'yes' if boolean else 'no'
+
+        session_api_data = api_data['video']['dmcInfo']['session_api']
+        session_api_endpoint = session_api_data['urls'][0]
+
+        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+
+        session_response = self._download_json(
+            session_api_endpoint['url'], video_id,
+            query={'_format': 'json'},
+            headers={'Content-Type': 'application/json'},
+            note='Downloading JSON metadata for %s' % format_id,
+            data=json.dumps({
+                'session': {
+                    'client_info': {
+                        'player_id': session_api_data['player_id'],
+                    },
+                    'content_auth': {
+                        'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
+                        'content_key_timeout': session_api_data['content_key_timeout'],
+                        'service_id': 'nicovideo',
+                        'service_user_id': session_api_data['service_user_id']
+                    },
+                    'content_id': session_api_data['content_id'],
+                    'content_src_id_sets': [{
+                        'content_src_ids': [{
+                            'src_id_to_mux': {
+                                'audio_src_ids': [audio_quality['id']],
+                                'video_src_ids': [video_quality['id']],
+                            }
+                        }]
+                    }],
+                    'content_type': 'movie',
+                    'content_uri': '',
+                    'keep_method': {
+                        'heartbeat': {
+                            'lifetime': session_api_data['heartbeat_lifetime']
+                        }
+                    },
+                    'priority': session_api_data['priority'],
+                    'protocol': {
+                        'name': 'http',
+                        'parameters': {
+                            'http_parameters': {
+                                'parameters': {
+                                    'http_output_download_parameters': {
+                                        'use_ssl': yesno(session_api_endpoint['is_ssl']),
+                                        'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
+                                    }
+                                }
+                            }
+                        }
+                    },
+                    'recipe_id': session_api_data['recipe_id'],
+                    'session_operation_auth': {
+                        'session_operation_auth_by_signature': {
+                            'signature': session_api_data['signature'],
+                            'token': session_api_data['token'],
+                        }
+                    },
+                    'timing_constraint': 'unlimited'
+                }
+            }))
+
+        resolution = video_quality.get('resolution', {})
+
+        return {
+            'url': session_response['data']['session']['content_uri'],
+            'format_id': format_id,
+            'ext': 'mp4',  # Session API are used in HTML5, which always serves mp4
+            'abr': float_or_none(audio_quality.get('bitrate'), 1000),
+            'vbr': float_or_none(video_quality.get('bitrate'), 1000),
+            'height': resolution.get('height'),
+            'width': resolution.get('width'),
+        }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -123,30 +258,84 @@ class NiconicoIE(InfoExtractor):
         if video_id.startswith('so'):
             video_id = self._match_id(handle.geturl())
 
-        video_info = self._download_xml(
-            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
-            note='Downloading video info page')
-
-        # Get flv info
-        flv_info_webpage = self._download_webpage(
-            'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
-            video_id, 'Downloading flv info')
-
-        flv_info = compat_urlparse.parse_qs(flv_info_webpage)
-        if 'url' not in flv_info:
-            if 'deleted' in flv_info:
-                raise ExtractorError('The video has been deleted.',
-                                     expected=True)
-            elif 'closed' in flv_info:
-                raise ExtractorError('Niconico videos now require logging in',
-                                     expected=True)
-            else:
-                raise ExtractorError('Unable to find video URL')
-
-        video_real_url = flv_info['url'][0]
+        api_data = self._parse_json(self._html_search_regex(
+            'data-api-data="([^"]+)"', webpage,
+            'API data', default='{}'), video_id)
+
+        def _format_id_from_url(video_url):
+            return 'economy' if video_real_url.endswith('low') else 'normal'
+
+        try:
+            video_real_url = api_data['video']['smileInfo']['url']
+        except KeyError:  # Flash videos
+            # Get flv info
+            flv_info_webpage = self._download_webpage(
+                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
+                video_id, 'Downloading flv info')
+
+            flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+            if 'url' not in flv_info:
+                if 'deleted' in flv_info:
+                    raise ExtractorError('The video has been deleted.',
+                                         expected=True)
+                elif 'closed' in flv_info:
+                    raise ExtractorError('Niconico videos now require logging in',
+                                         expected=True)
+                elif 'error' in flv_info:
+                    raise ExtractorError('%s reports error: %s' % (
+                        self.IE_NAME, flv_info['error'][0]), expected=True)
+                else:
+                    raise ExtractorError('Unable to find video URL')
+
+            video_info_xml = self._download_xml(
+                'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+                video_id, note='Downloading video info page')
+
+            def get_video_info(items):
+                if not isinstance(items, list):
+                    items = [items]
+                for item in items:
+                    ret = xpath_text(video_info_xml, './/' + item)
+                    if ret:
+                        return ret
+
+            video_real_url = flv_info['url'][0]
+
+            extension = get_video_info('movie_type')
+            if not extension:
+                extension = determine_ext(video_real_url)
+
+            formats = [{
+                'url': video_real_url,
+                'ext': extension,
+                'format_id': _format_id_from_url(video_real_url),
+            }]
+        else:
+            formats = []
+
+            dmc_info = api_data['video'].get('dmcInfo')
+            if dmc_info:  # "New" HTML5 videos
+                quality_info = dmc_info['quality']
+                for audio_quality in quality_info['audios']:
+                    for video_quality in quality_info['videos']:
+                        if not audio_quality['available'] or not video_quality['available']:
+                            continue
+                        formats.append(self._extract_format_for_quality(
+                            api_data, video_id, audio_quality, video_quality))
+
+                self._sort_formats(formats)
+            else:  # "Old" HTML5 videos
+                formats = [{
+                    'url': video_real_url,
+                    'ext': 'mp4',
+                    'format_id': _format_id_from_url(video_real_url),
+                }]
+
+            def get_video_info(items):
+                return dict_get(api_data['video'], items)
 
         # Start extracting information
-        title = xpath_text(video_info, './/title')
+        title = get_video_info('title')
         if not title:
             title = self._og_search_title(webpage, default=None)
         if not title:
@@ -160,18 +349,15 @@ class NiconicoIE(InfoExtractor):
         watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
         video_detail = watch_api_data.get('videoDetail', {})
 
-        extension = xpath_text(video_info, './/movie_type')
-        if not extension:
-            extension = determine_ext(video_real_url)
-
         thumbnail = (
-            xpath_text(video_info, './/thumbnail_url') or
+            get_video_info(['thumbnail_url', 'thumbnailURL']) or
             self._html_search_meta('image', webpage, 'thumbnail', default=None) or
             video_detail.get('thumbnail'))
 
-        description = xpath_text(video_info, './/description')
+        description = get_video_info('description')
 
-        timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
+        timestamp = (parse_iso8601(get_video_info('first_retrieve')) or
+                     unified_timestamp(get_video_info('postedDateTime')))
         if not timestamp:
             match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
             if match:
@@ -181,7 +367,7 @@ class NiconicoIE(InfoExtractor):
                 video_detail['postedAt'].replace('/', '-'),
                 delimiter=' ', timezone=datetime.timedelta(hours=9))
 
-        view_count = int_or_none(xpath_text(video_info, './/view_counter'))
+        view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
         if not view_count:
             match = self._html_search_regex(
                 r'>Views: <strong[^>]*>([^<]+)</strong>',
@@ -190,38 +376,33 @@ class NiconicoIE(InfoExtractor):
                 view_count = int_or_none(match.replace(',', ''))
         view_count = view_count or video_detail.get('viewCount')
 
-        comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
+        comment_count = (int_or_none(get_video_info('comment_num')) or
+                         video_detail.get('commentCount') or
+                         try_get(api_data, lambda x: x['thread']['commentCount']))
         if not comment_count:
             match = self._html_search_regex(
                 r'>Comments: <strong[^>]*>([^<]+)</strong>',
                 webpage, 'comment count', default=None)
             if match:
                 comment_count = int_or_none(match.replace(',', ''))
-        comment_count = comment_count or video_detail.get('commentCount')
 
         duration = (parse_duration(
-            xpath_text(video_info, './/length') or
+            get_video_info('length') or
             self._html_search_meta(
                 'video:duration', webpage, 'video duration', default=None)) or
-            video_detail.get('length'))
+            video_detail.get('length') or
+            get_video_info('duration'))
 
-        webpage_url = xpath_text(video_info, './/watch_url') or url
+        webpage_url = get_video_info('watch_url') or url
 
-        if video_info.find('.//ch_id') is not None:
-            uploader_id = video_info.find('.//ch_id').text
-            uploader = video_info.find('.//ch_name').text
-        elif video_info.find('.//user_id') is not None:
-            uploader_id = video_info.find('.//user_id').text
-            uploader = video_info.find('.//user_nickname').text
-        else:
-            uploader_id = uploader = None
+        owner = api_data.get('owner', {})
+        uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
+        uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
 
         return {
             'id': video_id,
-            'url': video_real_url,
             'title': title,
-            'ext': extension,
-            'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
+            'formats': formats,
             'thumbnail': thumbnail,
             'description': description,
             'uploader': uploader,
index f5e3f6815e6e9fee65a386b8ba93b9e6ba0e4d83..9b5ad5a9f0c38427834fb9e942f018a7aa3c2cbf 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
+    extract_attributes,
     get_element_by_class,
     urlencode_postdata,
 )
@@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         formats = []
-        for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage):
-            player_url = compat_urlparse.urljoin(url, player_url)
-
+        for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage):
+            player = extract_attributes(mobj.group(0))
+            player_path = player.get('href')
+            if not player_path:
+                continue
+            kind = self._search_regex(
+                r'(low|high)$', player.get('class') or '', 'kind',
+                default='low')
+            player_url = compat_urlparse.urljoin(url, player_path)
             player_page = self._download_webpage(
                 player_url, video_id, note='Downloading player page')
-
             entries = self._parse_html5_media_entries(
                 player_url, player_page, video_id, m3u8_id='hls-%s' % kind,
-                m3u8_entry_protocol='m3u8_native',
-                preference=2 if 'hq' in kind else 1)
-            formats.extend(entries[0]['formats'])
+                m3u8_entry_protocol='m3u8_native')
+            kind_formats = entries[0]['formats']
+            for f in kind_formats:
+                f['quality'] = 2 if kind == 'high' else 1
+            formats.extend(kind_formats)
 
         self._sort_formats(formats)
 
diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py
new file mode 100644 (file)
index 0000000..63e58aa
--- /dev/null
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .nuevo import NuevoBaseIE
+
+
+class NonkTubeIE(NuevoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized',
+        'info_dict': {
+            'id': '118636',
+            'ext': 'mp4',
+            'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized',
+            'age_limit': 18,
+            'duration': 1150.98,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.nonktube.com/embed/118636',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._extract_nuevo(
+            'https://www.nonktube.com/media/nuevo/econfig.php?key=%s'
+            % video_id, video_id)
+
+        info['age_limit'] = 18
+        return info
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644 (file)
index 0000000..f7fa098
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    smuggle_url,
+    try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+    _TESTS = [{
+        # clip
+        'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+        'info_dict': {
+            'id': '5386045029001',
+            'ext': 'mp4',
+            'title': 'Chrysler Imperial',
+            'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+            'timestamp': 1491399228,
+            'upload_date': '20170405',
+            'uploader_id': '618566855001',
+            'creator': 'vtele',
+            'view_count': int,
+            'series': 'RPM+',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # episode
+        'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+        'info_dict': {
+            'id': '5395865725001',
+            'title': 'Épisode 13 : Les retrouvailles',
+            'description': 'md5:336d5ebc5436534e61d16e63ddfca327',
+            'ext': 'mp4',
+            'timestamp': 1492019320,
+            'upload_date': '20170412',
+            'uploader_id': '618566855001',
+            'creator': 'vtele',
+            'view_count': int,
+            'series': "L'amour est dans le pré",
+            'season_number': 5,
+            'episode': 'Épisode 13',
+            'episode_number': 13,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = self._download_json(
+            'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id,
+            video_id)['data']
+
+        content = try_get(data, lambda x: x['contents'][0])
+
+        brightcove_id = data.get('brightcoveId') or content['brightcoveId']
+
+        series = try_get(
+            data, (
+                lambda x: x['show']['title'],
+                lambda x: x['season']['show']['title']),
+            compat_str)
+
+        episode = None
+        og = data.get('og')
+        if isinstance(og, dict) and og.get('type') == 'video.episode':
+            episode = og.get('title')
+
+        video = content or data
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': BrightcoveNewIE.ie_key(),
+            'url': smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+                {'geo_countries': ['CA']}),
+            'id': brightcove_id,
+            'title': video.get('title'),
+            'creator': video.get('source'),
+            'view_count': int_or_none(video.get('viewsCount')),
+            'series': series,
+            'season_number': int_or_none(try_get(
+                data, lambda x: x['season']['seasonNumber'])),
+            'episode': episode,
+            'episode_number': int_or_none(data.get('episodeNumber')),
+        }
index 79296f0ef4bdd3ce172a2c59ffcf59b781abf458..fa4ef20c52959240af41a0c8a7b08c02fd3eb54c 100644 (file)
@@ -28,17 +28,17 @@ class NPOBaseIE(InfoExtractor):
 
 class NPOIE(NPOBaseIE):
     IE_NAME = 'npo'
-    IE_DESC = 'npo.nl and ntr.nl'
+    IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
     _VALID_URL = r'''(?x)
                     (?:
                         npo:|
                         https?://
                             (?:www\.)?
                             (?:
-                                npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+                                npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|
                                 ntr\.nl/(?:[^/]+/){2,}|
                                 omroepwnl\.nl/video/fragment/[^/]+__|
-                                zapp\.nl/[^/]+/[^/]+/
+                                (?:zapp|npo3)\.nl/(?:[^/]+/){2}
                             )
                         )
                         (?P<id>[^/?#]+)
@@ -146,10 +146,16 @@ class NPOIE(NPOBaseIE):
     }, {
         'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
         'only_matching': True,
+    }, {
+        'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
+        'only_matching': True,
     }, {
         # live stream
         'url': 'npo:LI_NL1_4188102',
         'only_matching': True,
+    }, {
+        'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -338,7 +344,7 @@ class NPOLiveIE(NPOBaseIE):
         webpage = self._download_webpage(url, display_id)
 
         live_id = self._search_regex(
-            r'data-prid="([^"]+)"', webpage, 'live id')
+            [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')
 
         return {
             '_type': 'url_transparent',
index 7fe79cb539e2ce435d55891e16541fe8d2650f41..18ead94260650e9312cc8adab491423485a59494 100644 (file)
@@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor):
 
         vcodec = 'none' if data.get('mediaType') == 'Audio' else None
 
-        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
-
         for entry in entries:
             entry.update(common_info)
             for f in entry['formats']:
                 f['vcodec'] = vcodec
 
+        points = data.get('shortIndexPoints')
+        if isinstance(points, list):
+            chapters = []
+            for next_num, point in enumerate(points, start=1):
+                if not isinstance(point, dict):
+                    continue
+                start_time = parse_duration(point.get('startPoint'))
+                if start_time is None:
+                    continue
+                end_time = parse_duration(
+                    data.get('duration')
+                    if next_num == len(points)
+                    else points[next_num].get('startPoint'))
+                if end_time is None:
+                    continue
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'title': point.get('title'),
+                })
+            if chapters and len(entries) == 1:
+                entries[0]['chapters'] = chapters
+
         return self.playlist_result(entries, video_id, title, description)
 
 
@@ -216,7 +237,7 @@ class NRKTVIE(NRKBaseIE):
                             (?:/\d{2}-\d{2}-\d{4})?
                             (?:\#del=(?P<part_id>\d+))?
                     ''' % _EPISODE_RE
-    _API_HOST = 'psapi-we.nrk.no'
+    _API_HOST = 'psapi-ne.nrk.no'
 
     _TESTS = [{
         'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
index 87fb94d1f583f5b174fe8d9ace84e4791f3afa4e..be1e09d3752376c5f64c7563090dbee730d25317 100644 (file)
@@ -10,9 +10,10 @@ from ..utils import (
 
 
 class NuevoBaseIE(InfoExtractor):
-    def _extract_nuevo(self, config_url, video_id):
+    def _extract_nuevo(self, config_url, video_id, headers={}):
         config = self._download_xml(
-            config_url, video_id, transform_source=lambda s: s.strip())
+            config_url, video_id, transform_source=lambda s: s.strip(),
+            headers=headers)
 
         title = xpath_text(config, './title', 'title', fatal=True).strip()
         video_id = xpath_text(config, './mediaid', default=video_id)
index 94f57990b21664cc8922c088896dec54ab73a310..58da1bc27b120e6636e61237dc1b0b61972bf96e 100644 (file)
@@ -11,6 +11,7 @@ from ..utils import (
     get_element_by_class,
     int_or_none,
     js_to_json,
+    NO_DEFAULT,
     parse_iso8601,
     remove_start,
     strip_or_none,
@@ -198,6 +199,19 @@ class OnetPlIE(InfoExtractor):
             'upload_date': '20170214',
             'timestamp': 1487078046,
         },
+    }, {
+        # embedded via pulsembed
+        'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0',
+        'info_dict': {
+            'id': '501235.965429946',
+            'ext': 'mp4',
+            'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu',
+            'upload_date': '20170622',
+            'timestamp': 1498159955,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3',
         'only_matching': True,
@@ -212,13 +226,25 @@ class OnetPlIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _search_mvp_id(self, webpage, default=NO_DEFAULT):
+        return self._search_regex(
+            r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id',
+            default=default)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
-        mvp_id = self._search_regex(
-            r'data-params-mvp=["\'](\d+\.\d+)', webpage, 'mvp id')
+        mvp_id = self._search_mvp_id(webpage, default=None)
+
+        if not mvp_id:
+            pulsembed_url = self._search_regex(
+                r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1',
+                webpage, 'pulsembed url', group='url')
+            webpage = self._download_webpage(
+                pulsembed_url, video_id, 'Downloading pulsembed webpage')
+            mvp_id = self._search_mvp_id(webpage)
 
         return self.url_result(
             'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id)
index 84be2b1e3fe9fec47e915c13b14a8f23c0383232..52580baed3c49242be55529f8c75f5d110ff41f7 100644 (file)
@@ -3,12 +3,14 @@ import re
 import base64
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
-    int_or_none,
-    float_or_none,
+    determine_ext,
     ExtractorError,
+    float_or_none,
+    int_or_none,
+    try_get,
     unsmuggle_url,
-    determine_ext,
 )
 from ..compat import compat_urllib_parse_urlencode
 
@@ -39,13 +41,15 @@ class OoyalaBaseIE(InfoExtractor):
         formats = []
         if cur_auth_data['authorized']:
             for stream in cur_auth_data['streams']:
-                s_url = base64.b64decode(
-                    stream['url']['data'].encode('ascii')).decode('utf-8')
-                if s_url in urls:
+                url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
+                if not url_data:
+                    continue
+                s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8')
+                if not s_url or s_url in urls:
                     continue
                 urls.append(s_url)
                 ext = determine_ext(s_url, None)
-                delivery_type = stream['delivery_type']
+                delivery_type = stream.get('delivery_type')
                 if delivery_type == 'hls' or ext == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(
                         re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
@@ -65,7 +69,7 @@ class OoyalaBaseIE(InfoExtractor):
                 else:
                     formats.append({
                         'url': s_url,
-                        'ext': ext or stream.get('delivery_type'),
+                        'ext': ext or delivery_type,
                         'vcodec': stream.get('video_codec'),
                         'format_id': delivery_type,
                         'width': int_or_none(stream.get('width')),
@@ -136,6 +140,11 @@ class OoyalaIE(OoyalaBaseIE):
                 'title': 'Divide Tool Path.mp4',
                 'duration': 204.405,
             }
+        },
+        {
+            # empty stream['url']['data']
+            'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is',
+            'only_matching': True,
         }
     ]
 
index 1e2c54e68c3eb16b0ee8e9afb7b50b07ae429207..cc296eabd9db93bf3106e4c9bf427ad4abe645c9 100644 (file)
@@ -2,8 +2,6 @@
 from __future__ import unicode_literals
 
 import re
-import calendar
-import datetime
 
 from .common import InfoExtractor
 from ..compat import compat_str
@@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor):
         }
 
 
-class ORFOE1IE(InfoExtractor):
-    IE_NAME = 'orf:oe1'
-    IE_DESC = 'Radio Österreich 1'
-    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
-
-    # Audios on ORF radio are only available for 7 days, so we can't add tests.
-    _TESTS = [{
-        'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
-        'only_matching': True,
-    }, {
-        'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        show_id = self._match_id(url)
-        data = self._download_json(
-            'http://oe1.orf.at/programm/%s/konsole' % show_id,
-            show_id
-        )
-
-        timestamp = datetime.datetime.strptime('%s %s' % (
-            data['item']['day_label'],
-            data['item']['time']
-        ), '%d.%m.%Y %H:%M')
-        unix_timestamp = calendar.timegm(timestamp.utctimetuple())
-
-        return {
-            'id': show_id,
-            'title': data['item']['title'],
-            'url': data['item']['url_stream'],
-            'ext': 'mp3',
-            'description': data['item'].get('info'),
-            'timestamp': unix_timestamp
-        }
-
-
-class ORFFM4IE(InfoExtractor):
-    IE_NAME = 'orf:fm4'
-    IE_DESC = 'radio FM4'
-    _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
-
-    _TEST = {
-        'url': 'http://fm4.orf.at/player/20160110/IS/',
-        'md5': '01e736e8f1cef7e13246e880a59ad298',
-        'info_dict': {
-            'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
-            'ext': 'mp3',
-            'title': 'Im Sumpf',
-            'description': 'md5:384c543f866c4e422a55f66a62d669cd',
-            'duration': 7173,
-            'timestamp': 1452456073,
-            'upload_date': '20160110',
-        },
-        'skip': 'Live streams on FM4 got deleted soon',
-    }
-
+class ORFRadioIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
+        station = mobj.group('station')
         show_date = mobj.group('date')
         show_id = mobj.group('show')
 
+        if station == 'fm4':
+            show_id = '4%s' % show_id
+
         data = self._download_json(
-            'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id),
+            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),
             show_id
         )
 
         def extract_entry_dict(info, title, subtitle):
             return {
                 'id': info['loopStreamId'].replace('.mp3', ''),
-                'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'],
+                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']),
                 'title': title,
                 'description': subtitle,
                 'duration': (info['end'] - info['start']) / 1000,
@@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor):
         }
 
 
+class ORFFM4IE(ORFRadioIE):
+    IE_NAME = 'orf:fm4'
+    IE_DESC = 'radio FM4'
+    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+    _TEST = {
+        'url': 'http://fm4.orf.at/player/20170107/CC',
+        'md5': '2b0be47375432a7ef104453432a19212',
+        'info_dict': {
+            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
+            'ext': 'mp3',
+            'title': 'Solid Steel Radioshow',
+            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
+            'duration': 3599,
+            'timestamp': 1483819257,
+            'upload_date': '20170107',
+        },
+        'skip': 'Shows from ORF radios are only available for 7 days.'
+    }
+
+
+class ORFOE1IE(ORFRadioIE):
+    IE_NAME = 'orf:oe1'
+    IE_DESC = 'Radio Österreich 1'
+    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+    _TEST = {
+        'url': 'http://oe1.orf.at/player/20170108/456544',
+        'md5': '34d8a6e67ea888293741c86a099b745b',
+        'info_dict': {
+            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
+            'ext': 'mp3',
+            'title': 'Morgenjournal',
+            'duration': 609,
+            'timestamp': 1483858796,
+            'upload_date': '20170108',
+        },
+        'skip': 'Shows from ORF radios are only available for 7 days.'
+    }
+
+
 class ORFIPTVIE(InfoExtractor):
     IE_NAME = 'orf:iptv'
     IE_DESC = 'iptv.ORF.at'
index 881f3bcc760492c99e9d0ae35dc3d139025ca7ff..8ed3c6347c90a6bfabe18d51e796a2df4737a804 100644 (file)
@@ -1,9 +1,13 @@
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_str,
+    compat_HTTPError,
+)
 from ..utils import (
     clean_html,
     ExtractorError,
@@ -34,6 +38,25 @@ class PacktPubIE(PacktPubBaseIE):
             'upload_date': '20170331',
         },
     }
+    _NETRC_MACHINE = 'packtpub'
+    _TOKEN = None
+
+    def _real_initialize(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        try:
+            self._TOKEN = self._download_json(
+                self._MAPT_REST + '/users/tokens', None,
+                'Downloading Authorization Token', data=json.dumps({
+                    'email': username,
+                    'password': password,
+                }).encode())['data']['access']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404):
+                message = self._parse_json(e.cause.read().decode(), None)['message']
+                raise ExtractorError(message, expected=True)
+            raise
 
     def _handle_error(self, response):
         if response.get('status') != 'success':
@@ -51,14 +74,17 @@ class PacktPubIE(PacktPubBaseIE):
         course_id, chapter_id, video_id = mobj.group(
             'course_id', 'chapter_id', 'id')
 
+        headers = {}
+        if self._TOKEN:
+            headers['Authorization'] = 'Bearer ' + self._TOKEN
         video = self._download_json(
             '%s/users/me/products/%s/chapters/%s/sections/%s'
             % (self._MAPT_REST, course_id, chapter_id, video_id), video_id,
-            'Downloading JSON video')['data']
+            'Downloading JSON video', headers=headers)['data']
 
         content = video.get('content')
         if not content:
-            raise ExtractorError('This video is locked', expected=True)
+            self.raise_login_required('This video is locked')
 
         video_url = content['file']
 
index 133cc9b884c656b9f1ab66c0af7f446cd68a7fbd..c86d7077127e2da252bd90db257bc152aa18e166 100644 (file)
@@ -10,13 +10,13 @@ from ..utils import (
 
 class PandaTVIE(InfoExtractor):
     IE_DESC = '熊猫TV'
-    _VALID_URL = r'http://(?:www\.)?panda\.tv/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.panda.tv/10091',
+    _VALID_URL = r'https?://(?:www\.)?panda\.tv/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.panda.tv/66666',
         'info_dict': {
-            'id': '10091',
+            'id': '66666',
             'title': 're:.+',
-            'uploader': 'å\9b\9aå¾\92',
+            'uploader': 'å\88\98æ\9d\80鸡',
             'ext': 'flv',
             'is_live': True,
         },
@@ -24,13 +24,16 @@ class PandaTVIE(InfoExtractor):
             'skip_download': True,
         },
         'skip': 'Live stream is offline',
-    }
+    }, {
+        'url': 'https://www.panda.tv/66666',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         config = self._download_json(
-            'http://www.panda.tv/api_room?roomid=%s' % video_id, video_id)
+            'https://www.panda.tv/api_room?roomid=%s' % video_id, video_id)
 
         error_code = config.get('errno', 0)
         if error_code is not 0:
@@ -74,7 +77,7 @@ class PandaTVIE(InfoExtractor):
                 continue
             for pref, (ext, pl) in enumerate((('m3u8', '-hls'), ('flv', ''))):
                 formats.append({
-                    'url': 'http://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s'
+                    'url': 'https://pl%s%s.live.panda.tv/live_panda/%s%s%s.%s'
                     % (pl, plflag1, room_key, live_panda, suffix[quality], ext),
                     'format_id': '%s-%s' % (k, ext),
                     'quality': quality,
index 89c95fffb6e6ca7279d927424c19be9325e2ac4d..fc7bd34112f8863b741a67e2b97d473c7fc55e55 100644 (file)
@@ -19,7 +19,7 @@ class PandoraTVIE(InfoExtractor):
     IE_NAME = 'pandora.tv'
     IE_DESC = '판도라TV'
     _VALID_URL = r'https?://(?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2',
         'info_dict': {
             'id': '53294230',
@@ -34,7 +34,26 @@ class PandoraTVIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
         }
-    }
+    }, {
+        'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744',
+        'info_dict': {
+            'id': '54721744',
+            'ext': 'flv',
+            'title': '[HD] JAPAN COUNTDOWN 170423',
+            'description': '[HD] JAPAN COUNTDOWN 170423',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1704.9,
+            'upload_date': '20170423',
+            'uploader': 'GOGO_UCC',
+            'uploader_id': 'gogoucc',
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            # Test metadata only
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
@@ -86,7 +105,7 @@ class PandoraTVIE(InfoExtractor):
             'description': info.get('body'),
             'thumbnail': info.get('thumbnail') or info.get('poster'),
             'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')),
-            'upload_date': info['fid'][:8] if isinstance(info.get('fid'), compat_str) else None,
+            'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None,
             'uploader': info.get('nickname'),
             'uploader_id': info.get('upload_userid'),
             'view_count': str_to_int(info.get('hit')),
index 6166dc2adf41a431c79244e01256ab94f0dcb704..8889e4a1aaa3e41f49a63b53c010cf69d0842b1b 100644 (file)
@@ -8,7 +8,9 @@ from ..utils import (
     ExtractorError,
     determine_ext,
     int_or_none,
+    float_or_none,
     js_to_json,
+    orderedSet,
     strip_jsonp,
     strip_or_none,
     unified_strdate,
@@ -187,7 +189,7 @@ class PBSIE(InfoExtractor):
            # Direct video URL
            (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
            # Article with embedded player (or direct video)
-           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
            # Player
            (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
         )
@@ -263,6 +265,13 @@ class PBSIE(InfoExtractor):
             },
             'playlist_count': 2,
         },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+            'info_dict': {
+                'id': 'great-war',
+            },
+            'playlist_count': 3,
+        },
         {
             'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
             'info_dict': {
@@ -336,6 +345,21 @@ class PBSIE(InfoExtractor):
                 'formats': 'mincount:8',
             },
         },
+        {
+            # https://github.com/rg3/youtube-dl/issues/13801
+            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+            'info_dict': {
+                'id': '3003333873',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - full episode July 31, 2017',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 3265,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
             'only_matching': True,
@@ -381,10 +405,10 @@ class PBSIE(InfoExtractor):
             # tabbed frontline videos
             MULTI_PART_REGEXES = (
                 r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
-                r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+                r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
             )
             for p in MULTI_PART_REGEXES:
-                tabbed_videos = re.findall(p, webpage)
+                tabbed_videos = orderedSet(re.findall(p, webpage))
                 if tabbed_videos:
                     return tabbed_videos, presumptive_id, upload_date, description
 
@@ -424,6 +448,9 @@ class PBSIE(InfoExtractor):
                 if url:
                     break
 
+            if not url:
+                url = self._og_search_url(webpage)
+
             mobj = re.match(self._VALID_URL, url)
 
         player_id = mobj.group('player_id')
@@ -464,6 +491,7 @@ class PBSIE(InfoExtractor):
                     redirects.append(redirect)
                     redirect_urls.add(redirect_url)
 
+        chapters = []
         # Player pages may also serve different qualities
         for page in ('widget/partnerplayer', 'portalplayer'):
             player = self._download_webpage(
@@ -479,6 +507,20 @@ class PBSIE(InfoExtractor):
                     extract_redirect_urls(video_info)
                     if not info:
                         info = video_info
+                if not chapters:
+                    for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+                        chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+                        if not chapter:
+                            continue
+                        start_time = float_or_none(chapter.get('start_time'), 1000)
+                        duration = float_or_none(chapter.get('duration'), 1000)
+                        if start_time is None or duration is None:
+                            continue
+                        chapters.append({
+                            'start_time': start_time,
+                            'end_time': start_time + duration,
+                            'title': chapter.get('title'),
+                        })
 
         formats = []
         http_url = None
@@ -588,4 +630,5 @@ class PBSIE(InfoExtractor):
             'upload_date': upload_date,
             'formats': formats,
             'subtitles': subtitles,
+            'chapters': chapters,
         }
diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py
new file mode 100644 (file)
index 0000000..1d77722
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    unified_timestamp,
+)
+
+
+class PearVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.pearvideo.com/video_1076290',
+        'info_dict': {
+            'id': '1076290',
+            'ext': 'mp4',
+            'title': '小浣熊在主人家玻璃上滚石头:没砸',
+            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
+            'timestamp': 1494275280,
+            'upload_date': '20170508',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        quality = qualities(
+            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))
+
+        formats = [{
+            'url': mobj.group('url'),
+            'format_id': mobj.group('id'),
+            'quality': quality(mobj.group('id')),
+        } for mobj in re.finditer(
+            r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
+            webpage)]
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='value')
+        description = self._search_regex(
+            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'description', default=None,
+            group='value') or self._html_search_meta('Description', webpage)
+        timestamp = unified_timestamp(self._search_regex(
+            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
+            webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
index 1add6b84035041176f77f2bf454ca6bbfb8c7137..e5e08538c3dcde0797cd3805dad67d247f66b80f 100644 (file)
@@ -49,7 +49,7 @@ class PeriscopeIE(PeriscopeBaseIE):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage)
+            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
         if mobj:
             return mobj.group('url')
 
@@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE):
         stream = self._call_api(
             'getAccessPublic', {'broadcast_id': token}, token)
 
+        video_urls = set()
         formats = []
-        for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+        for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
             video_url = stream.get(format_id + '_url')
-            if not video_url:
+            if not video_url or video_url in video_urls:
                 continue
-            f = {
+            video_urls.add(video_url)
+            if format_id != 'rtmp':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, token, 'mp4',
+                    entry_protocol='m3u8_native'
+                    if state in ('ended', 'timed_out') else 'm3u8',
+                    m3u8_id=format_id, fatal=False))
+                continue
+            formats.append({
                 'url': video_url,
                 'ext': 'flv' if format_id == 'rtmp' else 'mp4',
-            }
-            if format_id != 'rtmp':
-                f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8'
-            formats.append(f)
+            })
         self._sort_formats(formats)
 
         return {
index e45d9fe552e2c5af4c3f35182aa8063dcca9e390..f6a9131b19bf693511a84cfa3118d89030053c6d 100644 (file)
@@ -18,6 +18,7 @@ from ..utils import (
     parse_duration,
     qualities,
     srt_subtitles_timecode,
+    try_get,
     update_url_query,
     urlencode_postdata,
 )
@@ -26,6 +27,39 @@ from ..utils import (
 class PluralsightBaseIE(InfoExtractor):
     _API_BASE = 'https://app.pluralsight.com'
 
+    def _download_course(self, course_id, url, display_id):
+        try:
+            return self._download_course_rpc(course_id, url, display_id)
+        except ExtractorError:
+            # Old API fallback
+            return self._download_json(
+                'https://app.pluralsight.com/player/user/api/v1/player/payload',
+                display_id, data=urlencode_postdata({'courseId': course_id}),
+                headers={'Referer': url})
+
+    def _download_course_rpc(self, course_id, url, display_id):
+        response = self._download_json(
+            '%s/player/functions/rpc' % self._API_BASE, display_id,
+            'Downloading course JSON',
+            data=json.dumps({
+                'fn': 'bootstrapPlayer',
+                'payload': {
+                    'courseId': course_id,
+                },
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json;charset=utf-8',
+                'Referer': url,
+            })
+
+        course = try_get(response, lambda x: x['payload']['course'], dict)
+        if course:
+            return course
+
+        raise ExtractorError(
+            '%s said: %s' % (self.IE_NAME, response['error']['message']),
+            expected=True)
+
 
 class PluralsightIE(PluralsightBaseIE):
     IE_NAME = 'pluralsight'
@@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE):
 
         display_id = '%s-%s' % (name, clip_id)
 
-        course = self._download_json(
-            'https://app.pluralsight.com/player/user/api/v1/player/payload',
-            display_id, data=urlencode_postdata({'courseId': course_name}),
-            headers={'Referer': url})
+        course = self._download_course(course_name, url, display_id)
 
         collection = course['modules']
 
@@ -224,6 +255,7 @@ class PluralsightIE(PluralsightBaseIE):
                 req_format_split = req_format.split('-', 1)
                 if len(req_format_split) > 1:
                     req_ext, req_quality = req_format_split
+                    req_quality = '-'.join(req_quality.split('-')[:2])
                     for allowed_quality in ALLOWED_QUALITIES:
                         if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
                             return (AllowedQuality(req_ext, (req_quality, )), )
@@ -330,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE):
 
         # TODO: PSM cookie
 
-        course = self._download_json(
-            '%s/player/functions/rpc' % self._API_BASE, course_id,
-            'Downloading course JSON',
-            data=json.dumps({
-                'fn': 'bootstrapPlayer',
-                'payload': {
-                    'courseId': course_id,
-                }
-            }).encode('utf-8'),
-            headers={
-                'Content-Type': 'application/json;charset=utf-8'
-            })['payload']['course']
+        course = self._download_course(course_id, url, course_id)
 
         title = course['title']
         course_name = course['name']
index f20946a2bd0616d8d90cebf360570a3f0bc40e94..25fcebf9fa6a06a6bf955468ddf0740b648593a5 100644 (file)
@@ -9,39 +9,46 @@ from ..utils import int_or_none
 
 class PodomaticIE(InfoExtractor):
     IE_NAME = 'podomatic'
-    _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
+    _VALID_URL = r'''(?x)
+                    (?P<proto>https?)://
+                        (?:
+                            (?P<channel>[^.]+)\.podomatic\.com/entry|
+                            (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes
+                        )/
+                        (?P<id>[^/?#&]+)
+                '''
 
-    _TESTS = [
-        {
-            'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
-            'md5': '84bb855fcf3429e6bf72460e1eed782d',
-            'info_dict': {
-                'id': '2009-01-02T16_03_35-08_00',
-                'ext': 'mp3',
-                'uploader': 'Science Teaching Tips',
-                'uploader_id': 'scienceteachingtips',
-                'title': '64.  When the Moon Hits Your Eye',
-                'duration': 446,
-            }
-        },
-        {
-            'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
-            'md5': 'd2cf443931b6148e27638650e2638297',
-            'info_dict': {
-                'id': '2013-11-15T16_31_21-08_00',
-                'ext': 'mp3',
-                'uploader': 'Ostbahnhof / Techno Mix',
-                'uploader_id': 'ostbahnhof',
-                'title': 'Einunddreizig',
-                'duration': 3799,
-            }
-        },
-    ]
+    _TESTS = [{
+        'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+        'md5': '84bb855fcf3429e6bf72460e1eed782d',
+        'info_dict': {
+            'id': '2009-01-02T16_03_35-08_00',
+            'ext': 'mp3',
+            'uploader': 'Science Teaching Tips',
+            'uploader_id': 'scienceteachingtips',
+            'title': '64.  When the Moon Hits Your Eye',
+            'duration': 446,
+        }
+    }, {
+        'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+        'md5': 'd2cf443931b6148e27638650e2638297',
+        'info_dict': {
+            'id': '2013-11-15T16_31_21-08_00',
+            'ext': 'mp3',
+            'uploader': 'Ostbahnhof / Techno Mix',
+            'uploader_id': 'ostbahnhof',
+            'title': 'Einunddreizig',
+            'duration': 3799,
+        }
+    }, {
+        'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        channel = mobj.group('channel')
+        channel = mobj.group('channel') or mobj.group('channel_2')
 
         json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +
                      '?permalink=true&rtmp=0') %
index 2ac1fcb0bc90f500696ca0dc29db9f4c911f0d27..978d6f813b6d0a88aafb707aba40de092c926a44 100644 (file)
@@ -65,7 +65,7 @@ class PolskieRadioIE(InfoExtractor):
         webpage = self._download_webpage(url, playlist_id)
 
         content = self._search_regex(
-            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
+            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
             webpage, 'content')
 
         timestamp = unified_timestamp(self._html_search_regex(
index 842317e6c9cc2312064fae4e61e5703352aa0096..b52879c7a444fef98b2354660025958a7c5ce180 100644 (file)
@@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):
              r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
 
         sources = self._parse_json(js_to_json(self._search_regex(
-            r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
+            r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
             webpage, 'sources', default='{}')), video_id)
 
         if not sources:
@@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor):
         view_count = int_or_none(self._html_search_regex(
             r'(\d+) views\s*<', webpage, 'view count', fatal=False))
         thumbnail = self._search_regex(
-            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
+            r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
+            'thumbnail', fatal=False, group='url')
 
         return {
             'id': video_id,
index b25f1f193fc7b1590d57966e7c819d16a4b43cd0..3428458afa987fb3eb8c3be061742cefce6e2734 100644 (file)
@@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+                            (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                             (?:www\.)?thumbzilla\.com/video/
                         )
                         (?P<id>[\da-z]+)
@@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor):
     }, {
         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
         'only_matching': True,
+    }, {
+        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -183,7 +186,7 @@ class PornHubIE(InfoExtractor):
             title, thumbnail, duration = [None] * 3
 
         video_uploader = self._html_search_regex(
-            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
+            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
             webpage, 'uploader', fatal=False)
 
         view_count = self._extract_count(
@@ -224,13 +227,20 @@ class PornHubIE(InfoExtractor):
 
 class PornHubPlaylistBaseIE(InfoExtractor):
     def _extract_entries(self, webpage):
+        # Only process container div with main playlist content skipping
+        # drop-down menu that uses similar pattern for videos (see
+        # https://github.com/rg3/youtube-dl/issues/11594).
+        container = self._search_regex(
+            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+            'container', default=webpage)
+
         return [
             self.url_result(
                 'http://www.pornhub.com/%s' % video_url,
                 PornHubIE.ie_key(), video_title=title)
             for video_url, title in orderedSet(re.findall(
                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
-                webpage))
+                container))
         ]
 
     def _real_extract(self, url):
@@ -238,22 +248,18 @@ class PornHubPlaylistBaseIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
 
-        # Only process container div with main playlist content skipping
-        # drop-down menu that uses similar pattern for videos (see
-        # https://github.com/rg3/youtube-dl/issues/11594).
-        container = self._search_regex(
-            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
-            'container', default=webpage)
-
-        entries = self._extract_entries(container)
+        entries = self._extract_entries(webpage)
 
         playlist = self._parse_json(
             self._search_regex(
-                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
-            playlist_id)
+                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
+                'playlist', default='{}'),
+            playlist_id, fatal=False)
+        title = playlist.get('title') or self._search_regex(
+            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 
         return self.playlist_result(
-            entries, playlist_id, playlist.get('title'), playlist.get('description'))
+            entries, playlist_id, title, playlist.get('description'))
 
 
 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
@@ -293,6 +299,7 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE):
             except ExtractorError as e:
                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                     break
+                raise
             page_entries = self._extract_entries(webpage)
             if not page_entries:
                 break
index 17c27da46da7576205afba3c53254728f711d974..084308aeb8922a1632eafe5a8395381710952c60 100644 (file)
@@ -2,38 +2,37 @@
 from __future__ import unicode_literals
 
 import random
-import time
 import re
+import time
 
 from .common import InfoExtractor
 from ..utils import (
-    sanitized_Request,
-    strip_jsonp,
-    unescapeHTML,
     clean_html,
     ExtractorError,
+    strip_jsonp,
+    unescapeHTML,
 )
 
 
 class QQMusicIE(InfoExtractor):
     IE_NAME = 'qqmusic'
     IE_DESC = 'QQ音乐'
-    _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
-        'md5': '9ce1c1c8445f561506d2e3cfb0255705',
+        'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
+        'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
         'info_dict': {
             'id': '004295Et37taLD',
             'ext': 'mp3',
             'title': '可惜没如果',
             'release_date': '20141227',
             'creator': '林俊杰',
-            'description': 'md5:d327722d0361576fde558f1ac68a7065',
+            'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
             'thumbnail': r're:^https?://.*\.jpg$',
         }
     }, {
         'note': 'There is no mp3-320 version of this song.',
-        'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+        'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
         'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
         'info_dict': {
             'id': '004MsGEo3DdNxV',
@@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor):
         }
     }, {
         'note': 'lyrics not in .lrc format',
-        'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
         'info_dict': {
             'id': '001JyApY11tIp6',
             'ext': 'mp3',
             'title': 'Shadows Over Transylvania',
             'release_date': '19970225',
             'creator': 'Dark Funeral',
-            'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
         'params': {
@@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor):
             [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
             detail_info_page, 'album mid', default=None)
         if albummid:
-            thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
                             % (albummid[-2:-1], albummid[-1], albummid)
 
         guid = self.m_r_get_ruin()
@@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor):
     def qq_static_url(category, mid):
         return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
 
-    @classmethod
-    def get_entries_from_page(cls, page):
+    def get_singer_all_songs(self, singmid, num):
+        return self._download_webpage(
+            r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+            query={
+                'format': 'json',
+                'inCharset': 'utf8',
+                'outCharset': 'utf-8',
+                'platform': 'yqq',
+                'needNewCode': 0,
+                'singermid': singmid,
+                'order': 'listen',
+                'begin': 0,
+                'num': num,
+                'songstatus': 1,
+            })
+
+    def get_entries_from_page(self, singmid):
         entries = []
 
-        for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page):
-            song_mid = unescapeHTML(item).split('|')[-5]
-            entries.append(cls.url_result(
-                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
-                song_mid))
+        default_num = 1
+        json_text = self.get_singer_all_songs(singmid, default_num)
+        json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        if json_obj_all_songs['code'] == 0:
+            total = json_obj_all_songs['data']['total']
+            json_text = self.get_singer_all_songs(singmid, total)
+            json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        for item in json_obj_all_songs['data']['list']:
+            if item['musicData'].get('songmid') is not None:
+                songmid = item['musicData']['songmid']
+                entries.append(self.url_result(
+                    r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
 
         return entries
 
@@ -172,42 +195,32 @@ class QQPlaylistBaseIE(InfoExtractor):
 class QQMusicSingerIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:singer'
     IE_DESC = 'QQ音乐 - 歌手'
-    _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
     _TEST = {
-        'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
         'info_dict': {
             'id': '001BLpXF2DyJe2',
             'title': '林俊杰',
             'description': 'md5:870ec08f7d8547c29c93010899103751',
         },
-        'playlist_count': 12,
+        'playlist_mincount': 12,
     }
 
     def _real_extract(self, url):
         mid = self._match_id(url)
 
-        singer_page = self._download_webpage(
-            self.qq_static_url('singer', mid), mid, 'Download singer page')
-
-        entries = self.get_entries_from_page(singer_page)
-
+        entries = self.get_entries_from_page(mid)
+        singer_page = self._download_webpage(url, mid, 'Download singer page')
         singer_name = self._html_search_regex(
-            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
-            default=None)
-
-        singer_id = self._html_search_regex(
-            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
-            default=None)
-
+            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
         singer_desc = None
 
-        if singer_id:
-            req = sanitized_Request(
-                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
-            req.add_header(
-                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+        if mid:
             singer_desc_page = self._download_xml(
-                req, mid, 'Donwload singer description XML')
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+                'Donwload singer description XML',
+                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
 
             singer_desc = singer_desc_page.find('./data/info/desc').text
 
@@ -217,10 +230,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
 class QQMusicAlbumIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:album'
     IE_DESC = 'QQ音乐 - 专辑'
-    _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+        'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
         'info_dict': {
             'id': '000gXCTb2AhRR1',
             'title': '我们都是这样长大的',
@@ -228,7 +241,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
         },
         'playlist_count': 4,
     }, {
-        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+        'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
         'info_dict': {
             'id': '002Y5a3b3AlCu3',
             'title': '그리고...',
@@ -246,7 +259,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
 
         entries = [
             self.url_result(
-                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+                'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
             ) for song in album['list']
         ]
         album_name = album.get('name')
@@ -260,31 +273,30 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
 class QQMusicToplistIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:toplist'
     IE_DESC = 'QQ音乐 - 排行榜'
-    _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=toplist&p=global_123',
+        'url': 'https://y.qq.com/n/yqq/toplist/123.html',
         'info_dict': {
-            'id': 'global_123',
+            'id': '123',
             'title': '美国iTunes榜',
+            'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',
         },
-        'playlist_count': 10,
+        'playlist_count': 100,
     }, {
-        'url': 'http://y.qq.com/#type=toplist&p=top_3',
+        'url': 'https://y.qq.com/n/yqq/toplist/3.html',
         'info_dict': {
-            'id': 'top_3',
+            'id': '3',
             'title': '巅峰榜·欧美',
-            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
-                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
-                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
-                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+            'description': 'md5:5a600d42c01696b26b71f8c4d43407da',
         },
         'playlist_count': 100,
     }, {
-        'url': 'http://y.qq.com/#type=toplist&p=global_106',
+        'url': 'https://y.qq.com/n/yqq/toplist/106.html',
         'info_dict': {
-            'id': 'global_106',
+            'id': '106',
             'title': '韩国Mnet榜',
+            'description': 'md5:cb84b325215e1d21708c615cac82a6e7',
         },
         'playlist_count': 50,
     }]
@@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
     def _real_extract(self, url):
         list_id = self._match_id(url)
 
-        list_type, num_id = list_id.split("_")
-
         toplist_json = self._download_json(
-            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
-            % (list_type, num_id),
-            list_id, 'Download toplist page')
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id,
+            note='Download toplist page',
+            query={'type': 'toplist', 'topid': list_id, 'format': 'json'})
 
-        entries = [
-            self.url_result(
-                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
-            ) for song in toplist_json['songlist']
-        ]
+        entries = [self.url_result(
+            'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic',
+            song['data']['songmid'])
+            for song in toplist_json['songlist']]
 
         topinfo = toplist_json.get('topinfo', {})
         list_name = topinfo.get('ListName')
@@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
 class QQMusicPlaylistIE(QQPlaylistBaseIE):
     IE_NAME = 'qqmusic:playlist'
     IE_DESC = 'QQ音乐 - 歌单'
-    _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html'
 
     _TESTS = [{
-        'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+        'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html',
         'info_dict': {
             'id': '3462654915',
             'title': '韩国5月新歌精选下旬',
@@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
         'playlist_count': 40,
         'skip': 'playlist gone',
     }, {
-        'url': 'http://y.qq.com/#type=taoge&id=1374105607',
+        'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html',
         'info_dict': {
             'id': '1374105607',
             'title': '易入人心的华语民谣',
@@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
         list_id = self._match_id(url)
 
         list_json = self._download_json(
-            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
-            % list_id, list_id, 'Download list page',
+            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg',
+            list_id, 'Download list page',
+            query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id},
             transform_source=strip_jsonp)
         if not len(list_json.get('cdlist', [])):
             if list_json.get('code'):
@@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
             raise ExtractorError('Unable to get playlist info')
 
         cdlist = list_json['cdlist'][0]
-        entries = [
-            self.url_result(
-                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
-            ) for song in cdlist['songlist']
-        ]
+        entries = [self.url_result(
+            'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'])
+            for song in cdlist['songlist']]
 
         list_name = cdlist.get('dissname')
         list_description = clean_html(unescapeHTML(cdlist.get('desc')))
index 3b40002a8f4bf7480e15f253fc8b17d0c2d6e7ca..b952e59b47535efc5268322f716f0e29fc8c6296 100644 (file)
@@ -20,20 +20,37 @@ from ..utils import (
 class RadioCanadaIE(InfoExtractor):
     IE_NAME = 'radiocanada'
     _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
-        'info_dict': {
-            'id': '7184272',
-            'ext': 'mp4',
-            'title': 'Le parcours du tireur capté sur vidéo',
-            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
-            'upload_date': '20141023',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
+    _TESTS = [
+        {
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+            'info_dict': {
+                'id': '7184272',
+                'ext': 'mp4',
+                'title': 'Le parcours du tireur capté sur vidéo',
+                'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+                'upload_date': '20141023',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
         },
-    }
+        {
+            # empty Title
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
+            'info_dict': {
+                'id': '7754998',
+                'ext': 'mp4',
+                'title': 'letelejournal22h',
+                'description': 'INTEGRALE WEB 22H-TJ',
+                'upload_date': '20170720',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        }
+    ]
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -59,6 +76,7 @@ class RadioCanadaIE(InfoExtractor):
             device_types.append('android')
 
         formats = []
+        error = None
         # TODO: extract f4m formats
         # f4m formats can be extracted using flashhd device_type but they produce unplayable file
         for device_type in device_types:
@@ -84,8 +102,8 @@ class RadioCanadaIE(InfoExtractor):
             if not v_url:
                 continue
             if v_url == 'null':
-                raise ExtractorError('%s said: %s' % (
-                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
+                error = xpath_text(v_data, 'message')
+                continue
             ext = determine_ext(v_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
@@ -129,6 +147,9 @@ class RadioCanadaIE(InfoExtractor):
                             formats.extend(self._extract_f4m_formats(
                                 base_url + '/manifest.f4m', video_id,
                                 f4m_id='hds', fatal=False))
+        if not formats and error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
         self._sort_formats(formats)
 
         subtitles = {}
@@ -141,7 +162,7 @@ class RadioCanadaIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': get_meta('Title'),
+            'title': get_meta('Title') or get_meta('AV-nomEmission'),
             'description': get_meta('Description') or get_meta('ShortDescription'),
             'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
             'duration': int_or_none(get_meta('length')),
index 81eb9db85b9127e292894e22c9dd6272b134832a..5bf64a56b71d6e0d4282d69476410e1108555877 100644 (file)
@@ -191,11 +191,12 @@ class RaiPlayIE(RaiBaseIE):
 
         info = {
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if relinker_info.get(
+                'is_live') else title,
             'alt_title': media.get('subtitle'),
             'description': media.get('description'),
-            'uploader': media.get('channel'),
-            'creator': media.get('editor'),
+            'uploader': strip_or_none(media.get('channel')),
+            'creator': strip_or_none(media.get('editor')),
             'duration': parse_duration(video.get('duration')),
             'timestamp': timestamp,
             'thumbnails': thumbnails,
@@ -208,10 +209,46 @@ class RaiPlayIE(RaiBaseIE):
         }
 
         info.update(relinker_info)
-
         return info
 
 
+class RaiPlayLiveIE(RaiBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.raiplay.it/dirette/rainews24',
+        'info_dict': {
+            'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
+            'display_id': 'rainews24',
+            'ext': 'mp4',
+            'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:6eca31500550f9376819f174e5644754',
+            'uploader': 'Rai News 24',
+            'creator': 'Rai News 24',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
+            webpage, 'content id')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': RaiPlayIE.ie_key(),
+            'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
+            'id': video_id,
+            'display_id': display_id,
+        }
+
+
 class RaiIE(RaiBaseIE):
     _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
     _TESTS = [{
@@ -308,11 +345,11 @@ class RaiIE(RaiBaseIE):
         media_type = media['type']
         if 'Audio' in media_type:
             relinker_info = {
-                'formats': {
+                'formats': [{
                     'format_id': media.get('formatoAudio'),
                     'url': media['audioUrl'],
                     'ext': media.get('formatoAudio'),
-                }
+                }]
             }
         elif 'Video' in media_type:
             relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
index afab624261c6db20e277c93f3f00379bc45f01de..5d6cc3610c4311ea637e137c87ac216e16c8e719 100644 (file)
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class RedBullTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)'
+    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?P<id>AP-\w+)'
     _TESTS = [{
         # film
         'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc',
@@ -42,6 +42,22 @@ class RedBullTVIE(InfoExtractor):
             'season_number': 2,
             'episode_number': 4,
         },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # segment
+        'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals',
+        'info_dict': {
+            'id': 'AP-1QSAQJ6V52111',
+            'ext': 'mp4',
+            'title': 'Semi Finals - Vans Park Series Pro Tour',
+            'description': 'md5:306a2783cdafa9e65e39aa62f514fd97',
+            'duration': 11791.991,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion',
         'only_matching': True,
@@ -82,7 +98,8 @@ class RedBullTVIE(InfoExtractor):
         title = info['title'].strip()
 
         formats = self._extract_m3u8_formats(
-            video['url'], video_id, 'mp4', 'm3u8_native')
+            video['url'], video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
         self._sort_formats(formats)
 
         subtitles = {}
diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py
new file mode 100644 (file)
index 0000000..01c85ee
--- /dev/null
@@ -0,0 +1,114 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+)
+
+
+class RedditIE(InfoExtractor):
+    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
+    _TEST = {
+        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
+        'url': 'https://v.redd.it/zv89llsvexdz',
+        'md5': '655d06ace653ea3b87bccfb1b27ec99d',
+        'info_dict': {
+            'id': 'zv89llsvexdz',
+            'ext': 'mp4',
+            'title': 'zv89llsvexdz',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        formats = self._extract_m3u8_formats(
+            'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
+            'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+        formats.extend(self._extract_mpd_formats(
+            'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
+            mpd_id='dash', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+        }
+
+
+class RedditRIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
+        'info_dict': {
+            'id': 'zv89llsvexdz',
+            'ext': 'mp4',
+            'title': 'That small heart attack.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1501941939,
+            'upload_date': '20170805',
+            'uploader': 'Antw87',
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 0,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
+        'only_matching': True,
+    }, {
+        # imgur
+        'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+        'only_matching': True,
+    }, {
+        # streamable
+        'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',
+        'only_matching': True,
+    }, {
+        # youtube
+        'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = self._download_json(
+            url + '.json', video_id)[0]['data']['children'][0]['data']
+
+        video_url = data['url']
+
+        # Avoid recursing into the same reddit URL
+        if 'reddit.com/' in video_url and '/%s/' % video_id in video_url:
+            raise ExtractorError('No media found', expected=True)
+
+        over_18 = data.get('over_18')
+        if over_18 is True:
+            age_limit = 18
+        elif over_18 is False:
+            age_limit = 0
+        else:
+            age_limit = None
+
+        return {
+            '_type': 'url_transparent',
+            'url': video_url,
+            'title': data.get('title'),
+            'thumbnail': data.get('thumbnail'),
+            'timestamp': float_or_none(data.get('created_utc')),
+            'uploader': data.get('author'),
+            'like_count': int_or_none(data.get('ups')),
+            'dislike_count': int_or_none(data.get('downs')),
+            'comment_count': int_or_none(data.get('num_comments')),
+            'age_limit': age_limit,
+        }
index c367a6ae74f3a7b63dd50f035a2d380f76dc3719..f70a75256c638f4a3ce9cda3b9577176e49f3cca 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor):
                         'format_id': format_id,
                         'height': int_or_none(format_id),
                     })
-        else:
+        medias = self._parse_json(
+            self._search_regex(
+                r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
+                'media definitions', default='{}'),
+            video_id, fatal=False)
+        if medias and isinstance(medias, list):
+            for media in medias:
+                format_url = media.get('videoUrl')
+                if not format_url or not isinstance(format_url, compat_str):
+                    continue
+                format_id = media.get('quality')
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': int_or_none(format_id),
+                })
+        if not formats:
             video_url = self._html_search_regex(
                 r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
             formats.append({'url': video_url})
@@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor):
             r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
             webpage, 'upload date', fatal=False))
         duration = int_or_none(self._search_regex(
-            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
         view_count = str_to_int(self._search_regex(
             r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
             webpage, 'view count', fatal=False))
index 2340dae535b3ed719ffd363283a1c542a2da80b1..e921ca3e6204a3f4dcd892c3e18b00e6148d4cc8 100644 (file)
@@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor):
     _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE',
+        'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',
         'info_dict': {
-            'id': '5111223049001',
+            'id': '5419055995001',
             'ext': 'mp4',
-            'title': ': LES HEROS DU 88e ETAGE',
-            'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé  la vie d\'innombrables personnes le 11 septembre 2001.',
+            'title': 'UN DELICIEUX PROJET',
+            'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',
             'uploader_id': '1969646226001',
-            'upload_date': '20160904',
-            'timestamp': 1472951103,
+            'upload_date': '20170502',
+            'timestamp': 1493745308,
         },
         'params': {
-            # rtmp download
             'skip_download': True,
         },
-        'skip': 'Only works from France',
+        'skip': 'only available for a week',
     }
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
 
@@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
-        brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
-        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+        if brightcove_legacy_url:
+            brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+                brightcove_legacy_url).query)['@videoPlayer'][0]
+        else:
+            brightcove_id = self._search_regex(
+                r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew',
+            brightcove_id)
index 54076de280f4e4ae24321c875f553cfcd146a574..3e22998c6d8384893ac801c98c4f6bde6988c2d1 100644 (file)
@@ -15,7 +15,7 @@ class RtlNlIE(InfoExtractor):
         https?://(?:www\.)?
         (?:
             rtlxl\.nl/[^\#]*\#!/[^/]+/|
-            rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
+            rtl\.nl/(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=|video/)
         )
         (?P<id>[0-9a-f-]+)'''
 
@@ -70,6 +70,9 @@ class RtlNlIE(InfoExtractor):
     }, {
         'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
         'only_matching': True,
+    }, {
+        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 889fa76289dc75aa7e868ee2b7496ee69e2f0e72..89d89b65a8108215a98957e20ae3045c0c205471 100644 (file)
@@ -7,43 +7,84 @@ import itertools
 from .common import InfoExtractor
 from ..compat import (
     compat_str,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     determine_ext,
-    unified_strdate,
+    bool_or_none,
+    int_or_none,
+    try_get,
+    unified_timestamp,
 )
 
 
-class RutubeIE(InfoExtractor):
+class RutubeBaseIE(InfoExtractor):
+    def _extract_video(self, video, video_id=None, require_title=True):
+        title = video['title'] if require_title else video.get('title')
+
+        age_limit = video.get('is_adult')
+        if age_limit is not None:
+            age_limit = 18 if age_limit is True else 0
+
+        uploader_id = try_get(video, lambda x: x['author']['id'])
+        category = try_get(video, lambda x: x['category']['name'])
+
+        return {
+            'id': video.get('id') or video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnail_url'),
+            'duration': int_or_none(video.get('duration')),
+            'uploader': try_get(video, lambda x: x['author']['name']),
+            'uploader_id': compat_str(uploader_id) if uploader_id else None,
+            'timestamp': unified_timestamp(video.get('created_ts')),
+            'category': [category] if category else None,
+            'age_limit': age_limit,
+            'view_count': int_or_none(video.get('hits')),
+            'comment_count': int_or_none(video.get('comments_count')),
+            'is_live': bool_or_none(video.get('is_livestream')),
+        }
+
+
+class RutubeIE(RutubeBaseIE):
     IE_NAME = 'rutube'
     IE_DESC = 'Rutube videos'
     _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
 
     _TESTS = [{
         'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+        'md5': '79938ade01294ef7e27574890d0d3769',
         'info_dict': {
             'id': '3eac3b4561676c17df9132a9a1e62e3e',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Раненный кенгуру забежал в аптеку',
             'description': 'http://www.ntdtv.ru ',
             'duration': 80,
             'uploader': 'NTDRussian',
             'uploader_id': '29790',
+            'timestamp': 1381943602,
             'upload_date': '20131016',
             'age_limit': 0,
         },
-        'params': {
-            # It requires ffmpeg (m3u8 download)
-            'skip_download': True,
-        },
     }, {
         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
         'only_matching': True,
     }, {
         'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
         'only_matching': True,
+    }, {
+        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
+        'only_matching': True,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
+        'only_matching': True,
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
+
     @staticmethod
     def _extract_urls(webpage):
         return [mobj.group('url') for mobj in re.finditer(
@@ -52,12 +93,12 @@ class RutubeIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+
         video = self._download_json(
             'http://rutube.ru/api/video/%s/?format=json' % video_id,
             video_id, 'Downloading video JSON')
 
-        # Some videos don't have the author field
-        author = video.get('author') or {}
+        info = self._extract_video(video, video_id)
 
         options = self._download_json(
             'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
@@ -79,19 +120,8 @@ class RutubeIE(InfoExtractor):
                 })
         self._sort_formats(formats)
 
-        return {
-            'id': video['id'],
-            'title': video['title'],
-            'description': video['description'],
-            'duration': video['duration'],
-            'view_count': video['hits'],
-            'formats': formats,
-            'thumbnail': video['thumbnail_url'],
-            'uploader': author.get('name'),
-            'uploader_id': compat_str(author['id']) if author else None,
-            'upload_date': unified_strdate(video['created_ts']),
-            'age_limit': 18 if video['is_adult'] else 0,
-        }
+        info['formats'] = formats
+        return info
 
 
 class RutubeEmbedIE(InfoExtractor):
@@ -103,7 +133,8 @@ class RutubeEmbedIE(InfoExtractor):
         'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
         'info_dict': {
             'id': 'a10e53b86e8f349080f718582ce4c661',
-            'ext': 'mp4',
+            'ext': 'flv',
+            'timestamp': 1387830582,
             'upload_date': '20131223',
             'uploader_id': '297833',
             'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
@@ -111,7 +142,7 @@ class RutubeEmbedIE(InfoExtractor):
             'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
         },
         'params': {
-            'skip_download': 'Requires ffmpeg',
+            'skip_download': True,
         },
     }, {
         'url': 'http://rutube.ru/play/embed/8083783',
@@ -125,10 +156,51 @@ class RutubeEmbedIE(InfoExtractor):
         canonical_url = self._html_search_regex(
             r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
             'Canonical URL')
-        return self.url_result(canonical_url, 'Rutube')
+        return self.url_result(canonical_url, RutubeIE.ie_key())
+
+
+class RutubePlaylistBaseIE(RutubeBaseIE):
+    def _next_page_url(self, page_num, playlist_id, *args, **kwargs):
+        return self._PAGE_TEMPLATE % (playlist_id, page_num)
 
+    def _entries(self, playlist_id, *args, **kwargs):
+        next_page_url = None
+        for pagenum in itertools.count(1):
+            page = self._download_json(
+                next_page_url or self._next_page_url(
+                    pagenum, playlist_id, *args, **kwargs),
+                playlist_id, 'Downloading page %s' % pagenum)
+
+            results = page.get('results')
+            if not results or not isinstance(results, list):
+                break
+
+            for result in results:
+                video_url = result.get('video_url')
+                if not video_url or not isinstance(video_url, compat_str):
+                    continue
+                entry = self._extract_video(result, require_title=False)
+                entry.update({
+                    '_type': 'url',
+                    'url': video_url,
+                    'ie_key': RutubeIE.ie_key(),
+                })
+                yield entry
 
-class RutubeChannelIE(InfoExtractor):
+            next_page_url = page.get('next')
+            if not next_page_url or not page.get('has_next'):
+                break
+
+    def _extract_playlist(self, playlist_id, *args, **kwargs):
+        return self.playlist_result(
+            self._entries(playlist_id, *args, **kwargs),
+            playlist_id, kwargs.get('playlist_name'))
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))
+
+
+class RutubeChannelIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:channel'
     IE_DESC = 'Rutube channels'
     _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
@@ -142,27 +214,8 @@ class RutubeChannelIE(InfoExtractor):
 
     _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
 
-    def _extract_videos(self, channel_id, channel_title=None):
-        entries = []
-        for pagenum in itertools.count(1):
-            page = self._download_json(
-                self._PAGE_TEMPLATE % (channel_id, pagenum),
-                channel_id, 'Downloading page %s' % pagenum)
-            results = page['results']
-            if not results:
-                break
-            entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results)
-            if not page['has_next']:
-                break
-        return self.playlist_result(entries, channel_id, channel_title)
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        channel_id = mobj.group('id')
-        return self._extract_videos(channel_id)
-
 
-class RutubeMovieIE(RutubeChannelIE):
+class RutubeMovieIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:movie'
     IE_DESC = 'Rutube movies'
     _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
@@ -176,11 +229,11 @@ class RutubeMovieIE(RutubeChannelIE):
         movie = self._download_json(
             self._MOVIE_TEMPLATE % movie_id, movie_id,
             'Downloading movie JSON')
-        movie_name = movie['name']
-        return self._extract_videos(movie_id, movie_name)
+        return self._extract_playlist(
+            movie_id, playlist_name=movie.get('name'))
 
 
-class RutubePersonIE(RutubeChannelIE):
+class RutubePersonIE(RutubePlaylistBaseIE):
     IE_NAME = 'rutube:person'
     IE_DESC = 'Rutube person videos'
     _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
@@ -193,3 +246,37 @@ class RutubePersonIE(RutubeChannelIE):
     }]
 
     _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
+
+
+class RutubePlaylistIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:playlist'
+    IE_DESC = 'Rutube playlists'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
+        'info_dict': {
+            'id': '3097',
+        },
+        'playlist_count': 27,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
+        'only_matching': True,
+    }]
+
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'
+
+    @classmethod
+    def suitable(cls, url):
+        if not super(RutubePlaylistIE, cls).suitable(url):
+            return False
+        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
+
+    def _next_page_url(self, page_num, playlist_id, item_kind):
+        return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        playlist_kind = qs['pl_type'][0]
+        playlist_id = qs['pl_id'][0]
+        return self._extract_playlist(playlist_id, item_kind=playlist_kind)
index a5e672c0a674e3461c261e0b6b2ca7ca9435ea30..d2713c19a053cba19448ad772525157144b19efa 100644 (file)
@@ -13,11 +13,15 @@ from ..utils import (
 class RUTVIE(InfoExtractor):
     IE_DESC = 'RUTV.RU'
     _VALID_URL = r'''(?x)
-        https?://player\.(?:rutv\.ru|vgtrk\.com)/
-            (?P<path>flash\d+v/container\.swf\?id=
-            |iframe/(?P<type>swf|video|live)/id/
-            |index/iframe/cast_id/)
-            (?P<id>\d+)'''
+                    https?://
+                        (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/
+                        (?P<path>
+                            flash\d+v/container\.swf\?id=|
+                            iframe/(?P<type>swf|video|live)/id/|
+                            index/iframe/cast_id/
+                        )
+                        (?P<id>\d+)
+                    '''
 
     _TESTS = [
         {
@@ -99,17 +103,21 @@ class RUTVIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/',
+            'only_matching': True,
+        },
     ]
 
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
         mobj = re.search(
-            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
+            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
             webpage)
         if mobj:
             return mobj.group('url')
diff --git a/youtube_dl/extractor/ruv.py b/youtube_dl/extractor/ruv.py
new file mode 100644 (file)
index 0000000..8f3cc40
--- /dev/null
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_timestamp,
+)
+
+
+class RuvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)'
+    _TESTS = [{
+        # m3u8
+        'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516',
+        'md5': '66347652f4e13e71936817102acc1724',
+        'info_dict': {
+            'id': '1144499',
+            'display_id': 'fh-valur/20170516',
+            'ext': 'mp4',
+            'title': 'FH - Valur',
+            'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.',
+            'timestamp': 1494963600,
+            'upload_date': '20170516',
+        },
+    }, {
+        # mp3
+        'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619',
+        'md5': '395ea250c8a13e5fdb39d4670ef85378',
+        'info_dict': {
+            'id': '1153630',
+            'display_id': 'morgunutvarpid/20170619',
+            'ext': 'mp3',
+            'title': 'Morgunútvarpið',
+            'description': 'md5:a4cf1202c0a1645ca096b06525915418',
+            'timestamp': 1497855000,
+            'upload_date': '20170619',
+        },
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ruv.is/node/1151854',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'
+
+        media_url = self._html_search_regex(
+            FIELD_RE % 'src', webpage, 'video URL', group='url')
+
+        video_id = self._search_regex(
+            r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)',
+            webpage, 'video id', default=display_id)
+
+        ext = determine_ext(media_url)
+
+        if ext == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        elif ext == 'mp3':
+            formats = [{
+                'format_id': 'mp3',
+                'url': media_url,
+                'vcodec': 'none',
+            }]
+        else:
+            formats = [{
+                'url': media_url,
+            }]
+
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._search_regex(
+            FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
index c3aec1edde5e9d02efb377fa39941ae01d2f04b4..909a6ba97993f161c6c39646b073185733626d18 100644 (file)
@@ -16,7 +16,6 @@ from ..utils import (
 
 class SafariBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
-    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
     _NETRC_MACHINE = 'safari'
 
     _API_BASE = 'https://www.safaribooksonline.com/api/v1'
@@ -28,10 +27,6 @@ class SafariBaseIE(InfoExtractor):
         self._login()
 
     def _login(self):
-        # We only need to log in once for courses or individual videos
-        if self.LOGGED_IN:
-            return
-
         (username, password) = self._get_login_info()
         if username is None:
             return
@@ -39,11 +34,17 @@ class SafariBaseIE(InfoExtractor):
         headers = std_headers.copy()
         if 'Referer' not in headers:
             headers['Referer'] = self._LOGIN_URL
-        login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)
 
         login_page = self._download_webpage(
-            login_page_request, None,
-            'Downloading login form')
+            self._LOGIN_URL, None, 'Downloading login form', headers=headers)
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'href=["\']/accounts/logout/', r'>Sign Out<'))
+
+        if is_logged(login_page):
+            self.LOGGED_IN = True
+            return
 
         csrf = self._html_search_regex(
             r"name='csrfmiddlewaretoken'\s+value='([^']+)'",
@@ -62,14 +63,12 @@ class SafariBaseIE(InfoExtractor):
         login_page = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
-        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
+        if not is_logged(login_page):
             raise ExtractorError(
                 'Login failed; make sure your credentials are correct and try again.',
                 expected=True)
 
-        SafariBaseIE.LOGGED_IN = True
-
-        self.to_screen('Login successful')
+        self.LOGGED_IN = True
 
 
 class SafariIE(SafariBaseIE):
index 5e22ea73029b3d254d76fa0722c8041daa17a6fd..3df51520b06e8bc2da1b3afa3fee75db4e21be46 100644 (file)
@@ -32,8 +32,9 @@ class SexuIE(InfoExtractor):
         formats = [{
             'url': source['file'].replace('\\', ''),
             'format_id': source.get('label'),
-            'height': self._search_regex(
-                r'^(\d+)[pP]', source.get('label', ''), 'height', default=None),
+            'height': int(self._search_regex(
+                r'^(\d+)[pP]', source.get('label', ''), 'height',
+                default=None)),
         } for source in sources if source.get('file')]
         self._sort_formats(formats)
 
index 74a1dc672e7725f2f3500284a53ade4ca16c380d..e89ebebe7567ebb0abae4a15b21b100339c64b44 100644 (file)
@@ -31,7 +31,7 @@ class SlideshareIE(InfoExtractor):
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
-            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
+            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
             webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
         if info['slideshow']['type'] != 'video':
index 7da12cef8fe1c06a064ee82da3e08ed39a39991d..a62ed84f1e34b75a721914d68a101ad2a0bb8983 100644 (file)
@@ -8,7 +8,11 @@ from ..compat import (
     compat_str,
     compat_urllib_parse_urlencode,
 )
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
 
 
 class SohuIE(InfoExtractor):
@@ -169,10 +173,11 @@ class SohuIE(InfoExtractor):
                 formats.append({
                     'url': video_url,
                     'format_id': format_id,
-                    'filesize': data['clipsBytes'][i],
-                    'width': data['width'],
-                    'height': data['height'],
-                    'fps': data['fps'],
+                    'filesize': int_or_none(
+                        try_get(data, lambda x: x['clipsBytes'][i])),
+                    'width': int_or_none(data.get('width')),
+                    'height': int_or_none(data.get('height')),
+                    'fps': int_or_none(data.get('fps')),
                 })
             self._sort_formats(formats)
 
index 0ee4a8ff8988de543e08e8a06d78dac2b27b018a..1c6799d579523806e9912912a0f50aa187c8773b 100644 (file)
@@ -1,8 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
 import itertools
+import re
 
 from .common import (
     InfoExtractor,
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     unified_strdate,
+    update_url_query,
 )
 
 
@@ -31,6 +32,7 @@ class SoundcloudIE(InfoExtractor):
 
     _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
+                            (?!stations/track)
                             (?P<uploader>[\w\d-]+)/
                             (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
                             (?P<title>[\w\d-]+)/?
@@ -119,9 +121,24 @@ class SoundcloudIE(InfoExtractor):
                 'license': 'cc-by-sa',
             },
         },
+        # private link, downloadable format
+        {
+            'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
+            'md5': '64a60b16e617d41d0bef032b7f55441e',
+            'info_dict': {
+                'id': '340344461',
+                'ext': 'wav',
+                'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
+                'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
+                'uploader': 'Ori Uplift Music',
+                'upload_date': '20170831',
+                'duration': 7449,
+                'license': 'all-rights-reserved',
+            },
+        },
     ]
 
-    _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'
+    _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg'
     _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
 
     @staticmethod
@@ -136,7 +153,7 @@ class SoundcloudIE(InfoExtractor):
 
     @classmethod
     def _resolv_url(cls, url):
-        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
+        return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
 
     def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
         track_id = compat_str(info['id'])
@@ -159,11 +176,13 @@ class SoundcloudIE(InfoExtractor):
             'license': info.get('license'),
         }
         formats = []
+        query = {'client_id': self._CLIENT_ID}
+        if secret_token is not None:
+            query['secret_token'] = secret_token
         if info.get('downloadable', False):
             # We can build a direct link to the song
-            format_url = (
-                'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
-                    track_id, self._CLIENT_ID))
+            format_url = update_url_query(
+                'https://api.soundcloud.com/tracks/%s/download' % track_id, query)
             formats.append({
                 'format_id': 'download',
                 'ext': info.get('original_format', 'mp3'),
@@ -174,11 +193,8 @@ class SoundcloudIE(InfoExtractor):
 
         # We have to retrieve the url
         format_dict = self._download_json(
-            'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
-            track_id, 'Downloading track url', query={
-                'client_id': self._CLIENT_ID,
-                'secret_token': secret_token,
-            })
+            'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
+            track_id, 'Downloading track url', query=query)
 
         for key, stream_url in format_dict.items():
             abr = int_or_none(self._search_regex(
@@ -215,7 +231,7 @@ class SoundcloudIE(InfoExtractor):
             # cannot be always used, sometimes it can give an HTTP 404 error
             formats.append({
                 'format_id': 'fallback',
-                'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+                'url': update_url_query(info['stream_url'], query),
                 'ext': ext,
             })
 
@@ -236,7 +252,7 @@ class SoundcloudIE(InfoExtractor):
         track_id = mobj.group('track_id')
 
         if track_id is not None:
-            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
+            info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
             full_title = track_id
             token = mobj.group('secret_token')
             if token:
@@ -261,7 +277,7 @@ class SoundcloudIE(InfoExtractor):
 
             self.report_resolve(full_title)
 
-            url = 'http://soundcloud.com/%s' % resolve_title
+            url = 'https://soundcloud.com/%s' % resolve_title
             info_json_url = self._resolv_url(url)
         info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
 
@@ -290,7 +306,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
             'id': '2284613',
             'title': 'The Royal Concept EP',
         },
-        'playlist_mincount': 6,
+        'playlist_mincount': 5,
     }, {
         'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
         'only_matching': True,
@@ -304,7 +320,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
         # extract simple title (uploader + slug of song title)
         slug_title = mobj.group('slug_title')
         full_title = '%s/sets/%s' % (uploader, slug_title)
-        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
+        url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
 
         token = mobj.group('token')
         if token:
@@ -330,7 +346,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
         }
 
 
-class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
+class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
+    _API_BASE = 'https://api.soundcloud.com'
+    _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+    def _extract_playlist(self, base_url, playlist_id, playlist_title):
+        COMMON_QUERY = {
+            'limit': 50,
+            'client_id': self._CLIENT_ID,
+            'linked_partitioning': '1',
+        }
+
+        query = COMMON_QUERY.copy()
+        query['offset'] = 0
+
+        next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
+
+        entries = []
+        for i in itertools.count():
+            response = self._download_json(
+                next_href, playlist_id, 'Downloading track page %s' % (i + 1))
+
+            collection = response['collection']
+            if not collection:
+                break
+
+            def resolve_permalink_url(candidates):
+                for cand in candidates:
+                    if isinstance(cand, dict):
+                        permalink_url = cand.get('permalink_url')
+                        entry_id = self._extract_id(cand)
+                        if permalink_url and permalink_url.startswith('http'):
+                            return permalink_url, entry_id
+
+            for e in collection:
+                permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+                if permalink_url:
+                    entries.append(self.url_result(permalink_url, video_id=entry_id))
+
+            next_href = response.get('next_href')
+            if not next_href:
+                break
+
+            parsed_next_href = compat_urlparse.urlparse(response['next_href'])
+            qs = compat_urlparse.parse_qs(parsed_next_href.query)
+            qs.update(COMMON_QUERY)
+            next_href = compat_urlparse.urlunparse(
+                parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_title,
+            'entries': entries,
+        }
+
+
+class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
     _VALID_URL = r'''(?x)
                         https?://
                             (?:(?:www|m)\.)?soundcloud\.com/
@@ -380,21 +452,18 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
         'url': 'https://soundcloud.com/grynpyret/spotlight',
         'info_dict': {
             'id': '7098329',
-            'title': 'GRYNPYRET (Spotlight)',
+            'title': 'Grynpyret (Spotlight)',
         },
         'playlist_mincount': 1,
     }]
 
-    _API_BASE = 'https://api.soundcloud.com'
-    _API_V2_BASE = 'https://api-v2.soundcloud.com'
-
     _BASE_URL_MAP = {
-        'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
-        'tracks': '%s/users/%%s/tracks' % _API_BASE,
-        'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
-        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
-        'likes': '%s/users/%%s/likes' % _API_V2_BASE,
-        'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+        'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+        'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE,
+        'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+        'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+        'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+        'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
     }
 
     _TITLE_MAP = {
@@ -410,70 +479,49 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group('user')
 
-        url = 'http://soundcloud.com/%s/' % uploader
+        url = 'https://soundcloud.com/%s/' % uploader
         resolv_url = self._resolv_url(url)
         user = self._download_json(
             resolv_url, uploader, 'Downloading user info')
 
         resource = mobj.group('rsrc') or 'all'
-        base_url = self._BASE_URL_MAP[resource] % user['id']
 
-        COMMON_QUERY = {
-            'limit': 50,
-            'client_id': self._CLIENT_ID,
-            'linked_partitioning': '1',
-        }
+        return self._extract_playlist(
+            self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']),
+            '%s (%s)' % (user['username'], self._TITLE_MAP[resource]))
 
-        query = COMMON_QUERY.copy()
-        query['offset'] = 0
-
-        next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
 
-        entries = []
-        for i in itertools.count():
-            response = self._download_json(
-                next_href, uploader, 'Downloading track page %s' % (i + 1))
-
-            collection = response['collection']
-            if not collection:
-                break
-
-            def resolve_permalink_url(candidates):
-                for cand in candidates:
-                    if isinstance(cand, dict):
-                        permalink_url = cand.get('permalink_url')
-                        entry_id = self._extract_id(cand)
-                        if permalink_url and permalink_url.startswith('http'):
-                            return permalink_url, entry_id
+class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
+    IE_NAME = 'soundcloud:trackstation'
+    _TESTS = [{
+        'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
+        'info_dict': {
+            'id': '286017854',
+            'title': 'Track station: your-text',
+        },
+        'playlist_mincount': 47,
+    }]
 
-            for e in collection:
-                permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
-                if permalink_url:
-                    entries.append(self.url_result(permalink_url, video_id=entry_id))
+    def _real_extract(self, url):
+        track_name = self._match_id(url)
 
-            next_href = response.get('next_href')
-            if not next_href:
-                break
+        webpage = self._download_webpage(url, track_name)
 
-            parsed_next_href = compat_urlparse.urlparse(response['next_href'])
-            qs = compat_urlparse.parse_qs(parsed_next_href.query)
-            qs.update(COMMON_QUERY)
-            next_href = compat_urlparse.urlunparse(
-                parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+        track_id = self._search_regex(
+            r'soundcloud:track-stations:(\d+)', webpage, 'track id')
 
-        return {
-            '_type': 'playlist',
-            'id': compat_str(user['id']),
-            'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
-            'entries': entries,
-        }
+        return self._extract_playlist(
+            '%s/stations/soundcloud:track-stations:%s/tracks'
+            % (self._API_V2_BASE, track_id),
+            track_id, 'Track station: %s' % track_name)
 
 
 class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
     _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
     IE_NAME = 'soundcloud:playlist'
     _TESTS = [{
-        'url': 'http://api.soundcloud.com/playlists/4110309',
+        'url': 'https://api.soundcloud.com/playlists/4110309',
         'info_dict': {
             'id': '4110309',
             'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
index ec1b603889754af70d516e70c123be7d2604387a..84298fee4279bb1484f5e52edbfa03f0bc148b01 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from .nexx import NexxEmbedIE
 from .spiegeltv import SpiegeltvIE
 from ..compat import compat_urlparse
 from ..utils import (
@@ -121,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
 
         },
         'playlist_count': 6,
+    }, {
+        # Nexx iFrame embed
+        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -143,6 +164,9 @@ class SpiegelArticleIE(InfoExtractor):
         entries = [
             self.url_result(compat_urlparse.urljoin(
                 self.http_scheme() + '//spiegel.de/', embed_path))
-            for embed_path in embeds
-        ]
-        return self.playlist_result(entries)
+            for embed_path in embeds]
+        if embeds:
+            return self.playlist_result(entries)
+
+        return self.playlist_from_matches(
+            NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
index e1cfb869834cf0d50b04a11c5fc137d7c9afcad8..6ccf4c3423a608d39b2bb4bfe1c189a1705cc8da 100644 (file)
-# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlparse
-from ..utils import (
-    determine_ext,
-    float_or_none,
-)
+from .nexx import NexxIE
 
 
 class SpiegeltvIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)'
-    _TESTS = [{
-        'url': 'http://www.spiegel.tv/filme/flug-mh370/',
-        'info_dict': {
-            'id': 'flug-mh370',
-            'ext': 'm4v',
-            'title': 'Flug MH370',
-            'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines',
-            'thumbnail': r're:http://.*\.jpg$',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        }
-    }, {
-        'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/',
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
         'only_matching': True,
-    }]
+    }
 
     def _real_extract(self, url):
-        if '/#/' in url:
-            url = url.replace('/#/', '/')
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
-
-        apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com'
-        version_json = self._download_json(
-            '%s/version.json' % apihost, video_id,
-            note='Downloading version information')
-        version_name = version_json['version_name']
-
-        slug_json = self._download_json(
-            '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id),
-            video_id,
-            note='Downloading object information')
-        oid = slug_json['object_id']
-
-        media_json = self._download_json(
-            '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid),
-            video_id, note='Downloading media information')
-        uuid = media_json['uuid']
-        is_wide = media_json['is_wide']
-
-        server_json = self._download_json(
-            'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
-            video_id, note='Downloading server information')
-
-        format = '16x9' if is_wide else '4x3'
-
-        formats = []
-        for streamingserver in server_json['streamingserver']:
-            endpoint = streamingserver.get('endpoint')
-            if not endpoint:
-                continue
-            play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
-            if endpoint.startswith('rtmp'):
-                formats.append({
-                    'url': endpoint,
-                    'format_id': 'rtmp',
-                    'app': compat_urllib_parse_urlparse(endpoint).path[1:],
-                    'play_path': play_path,
-                    'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
-                    'ext': 'flv',
-                    'rtmp_live': True,
-                })
-            elif determine_ext(endpoint) == 'm3u8':
-                formats.append({
-                    'url': endpoint.replace('[video]', play_path),
-                    'ext': 'm4v',
-                    'format_id': 'hls',  # Prefer hls since it allows to workaround georestriction
-                    'protocol': 'm3u8',
-                    'preference': 1,
-                    'http_headers': {
-                        'Accept-Encoding': 'deflate',  # gzip causes trouble on the server side
-                    },
-                })
-            else:
-                formats.append({
-                    'url': endpoint,
-                })
-        self._check_formats(formats, video_id)
-
-        thumbnails = []
-        for image in media_json['images']:
-            thumbnails.append({
-                'url': image['url'],
-                'width': image['width'],
-                'height': image['height'],
-            })
-
-        description = media_json['subtitle']
-        duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'thumbnails': thumbnails,
-            'formats': formats,
-        }
+        return self.url_result(
+            'https://api.nexx.cloud/v3/748/videos/byid/%s'
+            % self._match_id(url), ie=NexxIE.ie_key())
index e7bd5bf91921752757e1bc9beb390798b86a9c8a..54497c880ec2cc9cbfc5ff20756cbc19d4ee6c65 100644 (file)
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+)
 
 
 class SportBoxEmbedIE(InfoExtractor):
@@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor):
         'info_dict': {
             'id': '211355',
             'ext': 'mp4',
-            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+            'title': '211355',
             'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 292,
+            'view_count': int,
         },
         'params': {
             # m3u8 download
@@ -24,6 +30,9 @@ class SportBoxEmbedIE(InfoExtractor):
     }, {
         'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
         'only_matching': True,
+    }, {
+        'url': 'https://news.sportbox.ru/vdl/player/media/193095',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        formats = []
-
-        def cleanup_js(code):
-            # desktop_advert_config contains complex Javascripts and we don't need it
-            return js_to_json(re.sub(r'desktop_advert_config.*', '', code))
-
-        jwplayer_data = self._parse_json(self._search_regex(
-            r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id,
-            transform_source=cleanup_js)
-
-        hls_url = jwplayer_data.get('hls_url')
-        if hls_url:
-            formats.extend(self._extract_m3u8_formats(
-                hls_url, video_id, ext='mp4', m3u8_id='hls'))
-
-        rtsp_url = jwplayer_data.get('rtsp_url')
-        if rtsp_url:
-            formats.append({
-                'url': rtsp_url,
-                'format_id': 'rtsp',
-            })
+        wjplayer_data = self._parse_json(
+            self._search_regex(
+                r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'),
+            video_id, transform_source=js_to_json)
 
+        formats = []
+        for source in wjplayer_data['sources']:
+            src = source.get('src')
+            if not src:
+                continue
+            if determine_ext(src) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': src,
+                })
         self._sort_formats(formats)
 
-        title = jwplayer_data['node_title']
-        thumbnail = jwplayer_data.get('image_url')
+        view_count = int_or_none(self._search_regex(
+            r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
 
         return {
             'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
+            'title': video_id,
+            'thumbnail': wjplayer_data.get('poster'),
+            'duration': int_or_none(wjplayer_data.get('duration')),
+            'view_count': view_count,
             'formats': formats,
         }
index aa4fad162c5ff2e50b6c055af3b0673a3f72f39e..a9e34c027504ff0f6585fa51cae4ba3a1ec5bfcd 100644 (file)
@@ -21,6 +21,17 @@ class StreamangoIE(InfoExtractor):
             'ext': 'mp4',
             'title': '20170315_150006.mp4',
         }
+    }, {
+        # no og:title
+        'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4',
+        'info_dict': {
+            'id': 'foqebrpftarclpob',
+            'ext': 'mp4',
+            'title': 'foqebrpftarclpob',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
         'only_matching': True,
@@ -31,7 +42,7 @@ class StreamangoIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        title = self._og_search_title(webpage)
+        title = self._og_search_title(webpage, default=video_id)
 
         formats = []
         for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
index 9e533103c88b93157efdd28d7765a7e9ae961603..58e0b4c802781728cf9d4ba17a19236bc3b54565 100644 (file)
@@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
-        'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
+        'md5': '934bb6a6d220d99c010783c9719960d5',
         'info_dict': {
             'id': '765767',
             'ext': 'mp4',
@@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor):
         },
     }, {
         'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
-        'md5': 'e54a254fb8b871968fd8403255f28589',
+        'md5': '849a88c1e1ca47d41403c2ba5e59e261',
         'info_dict': {
             'id': '10002447',
             'ext': 'mp4',
@@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor):
         else:
             title = data['name']
 
+        subtitles = {}
+        srt_url = data.get('subtitles_srt')
+        if srt_url:
+            subtitles['cs'] = [{
+                'ext': 'srt',
+                'url': srt_url,
+            }]
+
         return {
             'id': video_id,
             'title': title,
@@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor):
             'description': data.get('web_site_text'),
             'duration': int_or_none(data.get('duration')),
             'view_count': int_or_none(data.get('views')),
+            'subtitles': subtitles,
         }
index 1b5afb73ee473b23e78c27caba6708ce9ceb348f..48bc4529e6ae8a265a672c066ac59b388ea3a5d5 100644 (file)
@@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE):
 
         if video_id:
             data = self._download_json(
-                'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+                'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+                video_id, headers=self.geo_verification_headers())
             info_dict = self._extract_video(data, video_id)
             if not info_dict.get('title'):
                 info_dict['title'] = re.sub(
diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py
new file mode 100644 (file)
index 0000000..7fe96bd
--- /dev/null
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TastyTradeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017',
+        'info_dict': {
+            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+            'ext': 'mp4',
+            'title': 'A History of Teaming',
+            'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+            'duration': 422.255,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        ooyala_code = self._search_regex(
+            r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1',
+            webpage, 'ooyala code', group='code')
+
+        info = self._search_json_ld(webpage, display_id, fatal=False)
+        info.update({
+            '_type': 'url_transparent',
+            'ie_key': OoyalaIE.ie_key(),
+            'url': 'ooyala:%s' % ooyala_code,
+            'display_id': display_id,
+        })
+        return info
index bf93eb8682e1f7424216ebd08438dbb6a3a9b955..e9474533f4dc66072539f75adf7abc7b20723240 100644 (file)
@@ -8,6 +8,9 @@ from ..utils import extract_attributes
 
 
 class TBSIE(TurnerBaseIE):
+    # https://github.com/rg3/youtube-dl/issues/13658
+    _WORKING = False
+
     _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
     _TESTS = [{
         'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
             'ext': 'mp4',
             'title': 'Theatrical Trailer',
             'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
     }, {
         'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
         'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
             'ext': 'mp4',
             'title': 'You Better Run',
             'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py
deleted file mode 100644 (file)
index a8c6ed7..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from ..utils import unified_strdate
-
-
-class TeamFourStarIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)'
-    _TEST = {
-        'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/',
-        'info_dict': {
-            'id': '0WdZO31W',
-            'title': 'TFS Abridged Parody Episode 1',
-            'description': 'md5:d60bc389588ebab2ee7ad432bda953ae',
-            'ext': 'mp4',
-            'timestamp': 1394168400,
-            'upload_date': '20080508',
-        },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        jwplatform_url = JWPlatformIE._extract_url(webpage)
-
-        video_title = self._html_search_regex(
-            r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>',
-            webpage, 'title')
-        video_date = unified_strdate(self._html_search_regex(
-            r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>',
-            webpage, 'date', fatal=False))
-        video_description = self._html_search_regex(
-            r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>',
-            webpage, 'description', fatal=False)
-        video_thumbnail = self._og_search_thumbnail(webpage)
-
-        return {
-            '_type': 'url_transparent',
-            'display_id': display_id,
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_date,
-            'thumbnail': video_thumbnail,
-            'url': jwplatform_url,
-        }
index 3f3c681aeffb697acec31ece111aa3589d410742..06a27fd0428b469abb64e6428faf263fc080cc33 100644 (file)
@@ -6,7 +6,10 @@ import re
 from .common import InfoExtractor
 
 from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    try_get,
+)
 
 
 class TEDIE(InfoExtractor):
@@ -113,8 +116,9 @@ class TEDIE(InfoExtractor):
     }
 
     def _extract_info(self, webpage):
-        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
-                                       webpage, 'info json')
+        info_json = self._search_regex(
+            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
+            webpage, 'info json')
         return json.loads(info_json)
 
     def _real_extract(self, url):
@@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):
         webpage = self._download_webpage(url, name,
                                          'Downloading playlist webpage')
         info = self._extract_info(webpage)
-        playlist_info = info['playlist']
+
+        playlist_info = try_get(
+            info, lambda x: x['__INITIAL_DATA__']['playlist'],
+            dict) or info['playlist']
 
         playlist_entries = [
             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
-            for talk in info['talks']
+            for talk in try_get(
+                info, lambda x: x['__INITIAL_DATA__']['talks'],
+                dict) or info['talks']
         ]
         return self.playlist_result(
             playlist_entries,
@@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):
 
     def _talk_info(self, url, video_name):
         webpage = self._download_webpage(url, video_name)
-        self.report_extraction(video_name)
 
-        talk_info = self._extract_info(webpage)['talks'][0]
+        info = self._extract_info(webpage)
+
+        talk_info = try_get(
+            info, lambda x: x['__INITIAL_DATA__']['talks'][0],
+            dict) or info['talks'][0]
+
+        title = talk_info['title'].strip()
 
         external = talk_info.get('external')
         if external:
@@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):
                 'url': ext_url or external['uri'],
             }
 
+        native_downloads = try_get(
+            talk_info, lambda x: x['downloads']['nativeDownloads'],
+            dict) or talk_info['nativeDownloads']
+
         formats = [{
             'url': format_url,
             'format_id': format_id,
             'format': format_id,
-        } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
+        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
         if formats:
             for f in formats:
                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
                 if finfo:
                     f.update(finfo)
 
+        player_talk = talk_info['player_talks'][0]
+
+        resources_ = player_talk.get('resources') or talk_info.get('resources')
+
         http_url = None
-        for format_id, resources in talk_info['resources'].items():
+        for format_id, resources in resources_.items():
             if format_id == 'h264':
                 for resource in resources:
                     h264_url = resource.get('file')
@@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):
 
         video_id = compat_str(talk_info['id'])
 
-        thumbnail = talk_info['thumb']
-        if not thumbnail.startswith('http'):
-            thumbnail = 'http://' + thumbnail
         return {
             'id': video_id,
-            'title': talk_info['title'].strip(),
-            'uploader': talk_info['speaker'],
-            'thumbnail': thumbnail,
+            'title': title,
+            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
+            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
             'description': self._og_search_description(webpage),
             'subtitles': self._get_subtitles(video_id, talk_info),
             'formats': formats,
@@ -252,20 +271,22 @@ class TEDIE(InfoExtractor):
         }
 
     def _get_subtitles(self, video_id, talk_info):
-        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
-        if languages:
-            sub_lang_list = {}
-            for l in languages:
-                sub_lang_list[l] = [
-                    {
-                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
-                        'ext': ext,
-                    }
-                    for ext in ['ted', 'srt']
-                ]
-            return sub_lang_list
-        else:
-            return {}
+        sub_lang_list = {}
+        for language in try_get(
+                talk_info,
+                (lambda x: x['downloads']['languages'],
+                 lambda x: x['languages']), list):
+            lang_code = language.get('languageCode') or language.get('ianaCode')
+            if not lang_code:
+                continue
+            sub_lang_list[lang_code] = [
+                {
+                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
+                    'ext': ext,
+                }
+                for ext in ['ted', 'srt']
+            ]
+        return sub_lang_list
 
     def _watch_info(self, url, name):
         webpage = self._download_webpage(url, name)
index 9a424b1c6aeb089af8050d7eee6b29591968c3aa..de236bbba899837f87a748cb7aab6cb8182b77c4 100644 (file)
@@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):
                     'url': src,
                 })
 
+        duration = info.get('duration')
+        tp_chapters = info.get('chapters', [])
+        chapters = []
+        if tp_chapters:
+            def _add_chapter(start_time, end_time):
+                start_time = float_or_none(start_time, 1000)
+                end_time = float_or_none(end_time, 1000)
+                if start_time is None or end_time is None:
+                    return
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                })
+
+            for chapter in tp_chapters[:-1]:
+                _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
         return {
             'title': info['title'],
             'subtitles': subtitles,
             'description': info['description'],
             'thumbnail': info['defaultThumbnailUrl'],
-            'duration': int_or_none(info.get('duration'), 1000),
+            'duration': float_or_none(duration, 1000),
             'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
             'uploader': info.get('billingCode'),
+            'chapters': chapters,
         }
 
     def _extract_theplatform_metadata(self, path, video_id):
index b8504f0ebdc04ade7d580102f0bcf506bebc4230..cd642355c9fd738bb461bff921aa395fa0304f4b 100644 (file)
@@ -3,10 +3,6 @@ from __future__ import unicode_literals
 from .common import InfoExtractor
 
 from ..compat import compat_urlparse
-from ..utils import (
-    int_or_none,
-    qualities,
-)
 
 
 class TheSceneIE(InfoExtractor):
@@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor):
             'season': 'Ready To Wear Spring 2013',
             'tags': list,
             'categories': list,
+            'upload_date': '20120913',
+            'timestamp': 1347512400,
+            'uploader': 'vogue',
         },
     }
 
@@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor):
             self._html_search_regex(
                 r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
 
-        player = self._download_webpage(player_url, display_id)
-        info = self._parse_json(
-            self._search_regex(
-                r'(?m)video\s*:\s*({.+?}),$', player, 'info json'),
-            display_id)
-
-        video_id = info['id']
-        title = info['title']
-
-        qualities_order = qualities(('low', 'high'))
-        formats = [{
-            'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']),
-            'url': f['src'],
-            'quality': qualities_order(f['quality']),
-        } for f in info['sources']]
-        self._sort_formats(formats)
-
         return {
-            'id': video_id,
+            '_type': 'url_transparent',
             'display_id': display_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': info.get('poster_frame'),
-            'duration': int_or_none(info.get('duration')),
-            'series': info.get('series_title'),
-            'season': info.get('season_title'),
-            'tags': info.get('tags'),
-            'categories': info.get('categories'),
+            'url': player_url,
+            'ie_key': 'CondeNast',
         }
index 197258df141b4b6864afa0e4c1df7d0db431f64e..6ab147ad726306ba9250599d34491a50e64e82d0 100644 (file)
@@ -2,13 +2,15 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import try_get
 
 
 class ThisOldHouseIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
-        'md5': '946f05bbaa12a33f9ae35580d2dfcfe3',
+        'md5': '568acf9ca25a639f0c4ff905826b662f',
         'info_dict': {
             'id': '2REGtUDQ',
             'ext': 'mp4',
@@ -28,8 +30,15 @@ class ThisOldHouseIE(InfoExtractor):
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        drupal_settings = self._parse_json(self._search_regex(
-            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-            webpage, 'drupal settings'), display_id)
-        video_id = drupal_settings['jwplatform']['video_id']
+        video_id = self._search_regex(
+            (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1',
+             r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'),
+            webpage, 'video id', default=None, group='id')
+        if not video_id:
+            drupal_settings = self._parse_json(self._search_regex(
+                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+                webpage, 'drupal settings'), display_id)
+            video_id = try_get(
+                drupal_settings, lambda x: x['jwplatform']['video_id'],
+                compat_str) or list(drupal_settings['comScore'])[0]
         return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id)
index c54b876d39678837bad4f7e5715fc02ddec747c1..348d6ecdf19ecde7787cbb12331561c8bb2a0284 100644 (file)
@@ -17,7 +17,7 @@ from ..utils import (
 
 class ToggleIE(InfoExtractor):
     IE_NAME = 'toggle'
-    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
         'info_dict': {
@@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor):
     }, {
         'url': 'http://video.toggle.sg/en/movies/seven-days/321936',
         'only_matching': True,
+    }, {
+        'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456',
+        'only_matching': True,
+    }, {
+        'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585',
+        'only_matching': True,
     }]
 
     _FORMAT_PREFERENCES = {
index 26d770992ab1618c95094513371a01fcf99d1a70..e59ed266109d16e12e7c64a9ba499ef4c116e449 100644 (file)
@@ -5,7 +5,6 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     js_to_json,
-    ExtractorError,
     urlencode_postdata,
     extract_attributes,
     smuggle_url,
@@ -78,8 +77,10 @@ class TouTvIE(InfoExtractor):
     def _real_extract(self, url):
         path = self._match_id(url)
         metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path)
+        # IsDrm does not necessarily mean the video is DRM protected (see
+        # https://github.com/rg3/youtube-dl/issues/13994).
         if metadata.get('IsDrm'):
-            raise ExtractorError('This video is DRM protected.', expected=True)
+            self.report_warning('This video is probably DRM protected.', path)
         video_id = metadata['IdMedia']
         details = metadata['Details']
         title = details['OriginalTitle']
index 938e05076313cb5b3d3284083d2cc7e699241d21..f705a06c95a7d04645d3ccb50784ff444cc6e934 100644 (file)
@@ -6,42 +6,48 @@ import re
 
 
 class ToypicsIE(InfoExtractor):
-    IE_DESC = 'Toypics user profile'
-    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
+    IE_DESC = 'Toypics video'
+    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
         'md5': '16e806ad6d6f58079d210fe30985e08b',
         'info_dict': {
             'id': '514',
             'ext': 'mp4',
-            'title': 'Chance-Bulge\'d, 2',
+            'title': "Chance-Bulge'd, 2",
             'age_limit': 18,
             'uploader': 'kidsune',
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        page = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
-        title = self._html_search_regex(
-            r'<title>Toypics - ([^<]+)</title>', page, 'title')
-        username = self._html_search_regex(
-            r'toypics.net/([^/"]+)" class="user-name">', page, 'username')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = self._parse_html5_media_entries(
+            url, webpage, video_id)[0]['formats']
+        title = self._html_search_regex([
+            r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h',
+            r'<title>([^<]+) - Toypics</title>',
+        ], webpage, 'title')
+
+        uploader = self._html_search_regex(
+            r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader',
+            fatal=False)
+
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': title,
-            'uploader': username,
+            'uploader': uploader,
             'age_limit': 18,
         }
 
 
 class ToypicsUserIE(InfoExtractor):
     IE_DESC = 'Toypics user profile'
-    _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+    _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'http://videos.toypics.net/Mikey',
         'info_dict': {
@@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        username = mobj.group('username')
+        username = self._match_id(url)
 
         profile_page = self._download_webpage(
             url, username, note='Retrieving profile page')
@@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor):
                 note='Downloading page %d/%d' % (n, page_count))
             urls.extend(
                 re.findall(
-                    r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">',
+                    r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"',
                     lpage))
 
         return {
index 2aae55e7e8f8742b471e4f8ffe94ab2ae79bae25..7421378a8c771543b3418a7d1f54b40cacb324e1 100644 (file)
@@ -3,138 +3,6 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    InAdvancePagedList,
-    float_or_none,
-    unescapeHTML,
-)
-
-
-class TudouIE(InfoExtractor):
-    IE_NAME = 'tudou'
-    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:(?:programs|wlplay)/view|(?:listplay|albumplay)/[\w-]{11})/(?P<id>[\w-]{11})'
-    _TESTS = [{
-        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        'md5': '140a49ed444bd22f93330985d8475fcb',
-        'info_dict': {
-            'id': '159448201',
-            'ext': 'f4v',
-            'title': '卡马乔国足开大脚长传冲吊集锦',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'timestamp': 1372113489000,
-            'description': '卡马乔卡家军,开大脚先进战术不完全集锦!',
-            'duration': 289.04,
-            'view_count': int,
-            'filesize': int,
-        }
-    }, {
-        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
-        'info_dict': {
-            'id': '117049447',
-            'ext': 'f4v',
-            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'timestamp': 1349207518000,
-            'description': 'md5:294612423894260f2dcd5c6c04fe248b',
-            'duration': 5478.33,
-            'view_count': int,
-            'filesize': int,
-        }
-    }]
-
-    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
-
-    # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf
-    # 0001, 0002 and 4001 are not included as they indicate temporary issues
-    TVC_ERRORS = {
-        '0003': 'The video is deleted or does not exist',
-        '1001': 'This video is unavailable due to licensing issues',
-        '1002': 'This video is unavailable as it\'s under review',
-        '1003': 'This video is unavailable as it\'s under review',
-        '3001': 'Password required',
-        '5001': 'This video is available in Mainland China only due to licensing issues',
-        '7001': 'This video is unavailable',
-        '8001': 'This video is unavailable due to licensing issues',
-    }
-
-    def _url_for_id(self, video_id, quality=None):
-        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
-        if quality:
-            info_url += '&hd' + quality
-        xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
-        error = xml_data.attrib.get('error')
-        if error is not None:
-            raise ExtractorError('Tudou said: %s' % error, expected=True)
-        final_url = xml_data.text
-        return final_url
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        item_data = self._download_json(
-            'http://www.tudou.com/tvp/getItemInfo.action?ic=%s' % video_id, video_id)
-
-        youku_vcode = item_data.get('vcode')
-        if youku_vcode:
-            return self.url_result('youku:' + youku_vcode, ie='Youku')
-
-        if not item_data.get('itemSegs'):
-            tvc_code = item_data.get('tvcCode')
-            if tvc_code:
-                err_msg = self.TVC_ERRORS.get(tvc_code)
-                if err_msg:
-                    raise ExtractorError('Tudou said: %s' % err_msg, expected=True)
-                raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code)
-            raise ExtractorError('Unxpected error returned from Tudou')
-
-        title = unescapeHTML(item_data['kw'])
-        description = item_data.get('desc')
-        thumbnail_url = item_data.get('pic')
-        view_count = int_or_none(item_data.get('playTimes'))
-        timestamp = int_or_none(item_data.get('pt'))
-
-        segments = self._parse_json(item_data['itemSegs'], video_id)
-        # It looks like the keys are the arguments that have to be passed as
-        # the hd field in the request url, we pick the higher
-        # Also, filter non-number qualities (see issue #3643).
-        quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
-                         key=lambda k: int(k))[-1]
-        parts = segments[quality]
-        len_parts = len(parts)
-        if len_parts > 1:
-            self.to_screen('%s: found %s parts' % (video_id, len_parts))
-
-        def part_func(partnum):
-            part = parts[partnum]
-            part_id = part['k']
-            final_url = self._url_for_id(part_id, quality)
-            ext = (final_url.split('?')[0]).split('.')[-1]
-            return [{
-                'id': '%s' % part_id,
-                'url': final_url,
-                'ext': ext,
-                'title': title,
-                'thumbnail': thumbnail_url,
-                'description': description,
-                'view_count': view_count,
-                'timestamp': timestamp,
-                'duration': float_or_none(part.get('seconds'), 1000),
-                'filesize': int_or_none(part.get('size')),
-                'http_headers': {
-                    'Referer': self._PLAYER_URL,
-                },
-            }]
-
-        entries = InAdvancePagedList(part_func, len_parts, 1)
-
-        return {
-            '_type': 'multi_video',
-            'entries': entries,
-            'id': video_id,
-            'title': title,
-        }
 
 
 class TudouPlaylistIE(InfoExtractor):
index 25aa9c58e522ec0cdecceeeb296f129c8da92a2d..be3eaa5c29c740f8fd39d6e76e69d1b006201602 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -49,7 +50,7 @@ class TurboIE(InfoExtractor):
         for child in item:
             m = re.search(r'url_video_(?P<quality>.+)', child.tag)
             if m:
-                quality = m.group('quality')
+                quality = compat_str(m.group('quality'))
                 formats.append({
                     'format_id': quality,
                     'url': child.text,
index 1c0be9fc6aa97260622b1148763a57dbf25ce50f..efeb677ee9a7be81e3b9289968ca9df41a2cb4ef 100644 (file)
@@ -13,6 +13,7 @@ from ..utils import (
     xpath_attr,
     update_url_query,
     ExtractorError,
+    strip_or_none,
 )
 
 
@@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE):
             'height': int_or_none(image.get('height')),
         } for image in video_data.findall('images/image')]
 
+        is_live = xpath_text(video_data, 'isLive') == 'true'
+
         return {
             'id': video_id,
-            'title': title,
+            'title': self._live_title(title) if is_live else title,
             'formats': formats,
             'subtitles': subtitles,
             'thumbnails': thumbnails,
-            'description': xpath_text(video_data, 'description'),
+            'thumbnail': xpath_text(video_data, 'poster'),
+            'description': strip_or_none(xpath_text(video_data, 'description')),
             'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),
             'timestamp': self._extract_timestamp(video_data),
             'upload_date': xpath_attr(video_data, 'metas', 'version'),
             'series': xpath_text(video_data, 'showTitle'),
             'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
             'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+            'is_live': is_live,
         }
index b6537141ae7c998267fa4dc7c9d72440c14d4590..8f8686a65414a2f8c390832b0a8f88483457413d 100644 (file)
@@ -2,9 +2,13 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
     extract_attributes,
+    try_get,
     urlencode_postdata,
     ExtractorError,
 )
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
             webpage, 'channel element'))
         title = current_channel['data-name']
 
-        resource_id = self._search_regex(
-            r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
-        platform = self._search_regex(
-            r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+        resource_id = current_channel['data-id']
+
         token = self._search_regex(
-            r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
-        validate = self._search_regex(
-            r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+            r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+            'token', group='token')
+
+        context = self._download_json(
+            'https://tvplayer.com/watch/context', display_id,
+            'Downloading JSON context', query={
+                'resource': resource_id,
+                'gen': token,
+            })
+
+        validate = context['validate']
+        platform = try_get(
+            context, lambda x: x['platform']['key'], compat_str) or 'firefox'
 
         try:
             response = self._download_json(
                 'http://api.tvplayer.com/api/v2/stream/live',
-                resource_id, headers={
+                display_id, 'Downloading JSON stream', headers={
                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                 }, data=urlencode_postdata({
+                    'id': resource_id,
                     'service': 1,
                     'platform': platform,
-                    'id': resource_id,
-                    'token': token,
                     'validate': validate,
                 }))['tvplayer']['response']
         except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
                     '%s said: %s' % (self.IE_NAME, response['error']), expected=True)
             raise
 
-        formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+        formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
         self._sort_formats(formats)
 
         return {
index 4fd1aa4bfbdaea2ec5abbac2161f6aea25e5fbbd..a42977f397e03ab71f70c585e1f3c34113b78eda 100644 (file)
@@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor):
     @staticmethod
     def _extract_urls(webpage):
         return [m.group('url') for m in re.finditer(
-            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
             webpage)]
 
     def _real_extract(self, url):
index 37e3bc4129fdc43033556d5ab9941965f70b3b8c..6eaf360a6f893a05b37c27b9f66b28705e20f4be 100644 (file)
@@ -7,20 +7,38 @@ from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
+    dict_get,
+    ExtractorError,
     float_or_none,
-    xpath_text,
-    remove_end,
     int_or_none,
-    ExtractorError,
+    remove_end,
+    try_get,
+    xpath_text,
 )
 
 from .periscope import PeriscopeIE
 
 
 class TwitterBaseIE(InfoExtractor):
-    def _get_vmap_video_url(self, vmap_url, video_id):
+    def _extract_formats_from_vmap_url(self, vmap_url, video_id):
         vmap_data = self._download_xml(vmap_url, video_id)
-        return xpath_text(vmap_data, './/MediaFile').strip()
+        video_url = xpath_text(vmap_data, './/MediaFile').strip()
+        if determine_ext(video_url) == 'm3u8':
+            return self._extract_m3u8_formats(
+                video_url, video_id, ext='mp4', m3u8_id='hls',
+                entry_protocol='m3u8_native')
+        return [{
+            'url': video_url,
+        }]
+
+    @staticmethod
+    def _search_dimensions_in_video_url(a_format, video_url):
+        m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+        if m:
+            a_format.update({
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
 
 
 class TwitterCardIE(TwitterBaseIE):
@@ -36,7 +54,8 @@ class TwitterCardIE(TwitterBaseIE):
                 'title': 'Twitter Card',
                 'thumbnail': r're:^https?://.*\.jpg$',
                 'duration': 30.033,
-            }
+            },
+            'skip': 'Video gone',
         },
         {
             'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
@@ -48,6 +67,7 @@ class TwitterCardIE(TwitterBaseIE):
                 'thumbnail': r're:^https?://.*\.jpg',
                 'duration': 80.155,
             },
+            'skip': 'Video gone',
         },
         {
             'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
@@ -65,7 +85,7 @@ class TwitterCardIE(TwitterBaseIE):
         },
         {
             'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
-            'md5': 'ab2745d0b0ce53319a534fccaa986439',
+            'md5': '6dabeaca9e68cbb71c99c322a4b42a11',
             'info_dict': {
                 'id': 'iBb2x00UVlv',
                 'ext': 'mp4',
@@ -73,16 +93,17 @@ class TwitterCardIE(TwitterBaseIE):
                 'uploader_id': '1189339351084113920',
                 'uploader': 'ArsenalTerje',
                 'title': 'Vine by ArsenalTerje',
+                'timestamp': 1447451307,
             },
             'add_ie': ['Vine'],
         }, {
             'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
-            'md5': '3846d0a07109b5ab622425449b59049d',
+            'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',
             'info_dict': {
                 'id': '705235433198714880',
                 'ext': 'mp4',
                 'title': 'Twitter web player',
-                'thumbnail': r're:^https?://.*\.jpg',
+                'thumbnail': r're:^https?://.*',
             },
         }, {
             'url': 'https://twitter.com/i/videos/752274308186120192',
@@ -90,6 +111,59 @@ class TwitterCardIE(TwitterBaseIE):
         },
     ]
 
+    def _parse_media_info(self, media_info, video_id):
+        formats = []
+        for media_variant in media_info.get('variants', []):
+            media_url = media_variant['url']
+            if media_url.endswith('.m3u8'):
+                formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+            elif media_url.endswith('.mpd'):
+                formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+            else:
+                vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)
+                a_format = {
+                    'url': media_url,
+                    'format_id': 'http-%d' % vbr if vbr else 'http',
+                    'vbr': vbr,
+                }
+                # Reported bitRate may be zero
+                if not a_format['vbr']:
+                    del a_format['vbr']
+
+                self._search_dimensions_in_video_url(a_format, media_url)
+
+                formats.append(a_format)
+        return formats
+
+    def _extract_mobile_formats(self, username, video_id):
+        webpage = self._download_webpage(
+            'https://mobile.twitter.com/%s/status/%s' % (username, video_id),
+            video_id, 'Downloading mobile webpage',
+            headers={
+                # A recent mobile UA is necessary for `gt` cookie
+                'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0',
+            })
+        main_script_url = self._html_search_regex(
+            r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL')
+        main_script = self._download_webpage(
+            main_script_url, video_id, 'Downloading main script')
+        bearer_token = self._search_regex(
+            r'BEARER_TOKEN\s*:\s*"([^"]+)"',
+            main_script, 'bearer token')
+        guest_token = self._search_regex(
+            r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)',
+            webpage, 'guest token')
+        api_data = self._download_json(
+            'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id,
+            video_id, 'Downloading mobile API data',
+            headers={
+                'Authorization': 'Bearer ' + bearer_token,
+                'x-guest-token': guest_token,
+            })
+        media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id]
+                                                  ['extended_entities']['media'][0]['video_info']) or {}
+        return self._parse_media_info(media_info, video_id)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -117,14 +191,6 @@ class TwitterCardIE(TwitterBaseIE):
         if periscope_url:
             return self.url_result(periscope_url, PeriscopeIE.ie_key())
 
-        def _search_dimensions_in_video_url(a_format, video_url):
-            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
-            if m:
-                a_format.update({
-                    'width': int(m.group('width')),
-                    'height': int(m.group('height')),
-                })
-
         video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
 
         if video_url:
@@ -135,15 +201,14 @@ class TwitterCardIE(TwitterBaseIE):
                     'url': video_url,
                 }
 
-                _search_dimensions_in_video_url(f, video_url)
+                self._search_dimensions_in_video_url(f, video_url)
 
                 formats.append(f)
 
         vmap_url = config.get('vmapUrl') or config.get('vmap_url')
         if vmap_url:
-            formats.append({
-                'url': self._get_vmap_video_url(vmap_url, video_id),
-            })
+            formats.extend(
+                self._extract_formats_from_vmap_url(vmap_url, video_id))
 
         media_info = None
 
@@ -152,29 +217,14 @@ class TwitterCardIE(TwitterBaseIE):
                 media_info = entity['mediaInfo']
 
         if media_info:
-            for media_variant in media_info['variants']:
-                media_url = media_variant['url']
-                if media_url.endswith('.m3u8'):
-                    formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
-                elif media_url.endswith('.mpd'):
-                    formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
-                else:
-                    vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
-                    a_format = {
-                        'url': media_url,
-                        'format_id': 'http-%d' % vbr if vbr else 'http',
-                        'vbr': vbr,
-                    }
-                    # Reported bitRate may be zero
-                    if not a_format['vbr']:
-                        del a_format['vbr']
-
-                    _search_dimensions_in_video_url(a_format, media_url)
-
-                    formats.append(a_format)
-
+            formats.extend(self._parse_media_info(media_info, video_id))
             duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
 
+        username = config.get('user', {}).get('screen_name')
+        if username:
+            formats.extend(self._extract_mobile_formats(username, video_id))
+
+        self._remove_duplicate_formats(formats)
         self._sort_formats(formats)
 
         title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
@@ -255,10 +305,10 @@ class TwitterIE(InfoExtractor):
         'info_dict': {
             'id': '700207533655363584',
             'ext': 'mp4',
-            'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel',
-            'description': 'JG on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+            'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel',
+            'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
             'thumbnail': r're:^https?://.*\.jpg',
-            'uploader': 'JG',
+            'uploader': 'Donte',
             'uploader_id': 'jaydingeer',
         },
         'params': {
@@ -270,9 +320,11 @@ class TwitterIE(InfoExtractor):
         'info_dict': {
             'id': 'MIOxnrUteUd',
             'ext': 'mp4',
-            'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン',
-            'uploader': 'TAKUMA',
-            'uploader_id': '1004126642786242560',
+            'title': 'FilmDrunk - Vine of the day',
+            'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"',
+            'uploader': 'FilmDrunk',
+            'uploader_id': 'Filmdrunk',
+            'timestamp': 1402826626,
             'upload_date': '20140615',
         },
         'add_ie': ['Vine'],
@@ -294,13 +346,28 @@ class TwitterIE(InfoExtractor):
         'info_dict': {
             'id': '1zqKVVlkqLaKB',
             'ext': 'mp4',
-            'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence',
+            'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence',
+            'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence  https://t.co/EKrVgIXF3s"',
             'upload_date': '20160923',
             'uploader_id': 'OPP_HSD',
-            'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police',
+            'uploader': 'Sgt Kerry Schmidt',
             'timestamp': 1474613214,
         },
         'add_ie': ['Periscope'],
+    }, {
+        # has mp4 formats via mobile API
+        'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
+        'info_dict': {
+            'id': '852138619213144067',
+            'ext': 'mp4',
+            'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
+            'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة   https://t.co/xg6OhpyKfN"',
+            'uploader': 'عالم الأخبار',
+            'uploader_id': 'news_al3alm',
+        },
+        'params': {
+            'format': 'best[format_id^=http-]',
+        },
     }]
 
     def _real_extract(self, url):
@@ -393,7 +460,7 @@ class TwitterAmplifyIE(TwitterBaseIE):
 
         vmap_url = self._html_search_meta(
             'twitter:amplify:vmap', webpage, 'vmap url')
-        video_url = self._get_vmap_video_url(vmap_url, video_id)
+        formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
 
         thumbnails = []
         thumbnail = self._html_search_meta(
@@ -415,11 +482,10 @@ class TwitterAmplifyIE(TwitterBaseIE):
             })
 
         video_w, video_h = _find_dimension('player')
-        formats = [{
-            'url': video_url,
+        formats[0].update({
             'width': video_w,
             'height': video_h,
-        }]
+        })
 
         return {
             'id': video_id,
index dae1aa3c6ba6fd211233abbd316fb78409c33faf..207c4a6a7ee8131c3e2e5d5823aefb336ad47c47 100644 (file)
@@ -15,6 +15,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     int_or_none,
+    js_to_json,
     sanitized_Request,
     unescapeHTML,
     urlencode_postdata,
@@ -52,6 +53,10 @@ class UdemyIE(InfoExtractor):
         # new URL schema
         'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906',
         'only_matching': True,
+    }, {
+        # no url in outputs format entry
+        'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812',
+        'only_matching': True,
     }]
 
     def _extract_course_info(self, webpage, video_id):
@@ -69,7 +74,7 @@ class UdemyIE(InfoExtractor):
             return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url
 
         checkout_url = unescapeHTML(self._search_regex(
-            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1',
+            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',
             webpage, 'checkout url', group='url', default=None))
         if checkout_url:
             raise ExtractorError(
@@ -219,7 +224,7 @@ class UdemyIE(InfoExtractor):
 
         def extract_output_format(src, f_id):
             return {
-                'url': src['url'],
+                'url': src.get('url'),
                 'format_id': '%sp' % (src.get('height') or f_id),
                 'width': int_or_none(src.get('width')),
                 'height': int_or_none(src.get('height')),
@@ -264,6 +269,25 @@ class UdemyIE(InfoExtractor):
                     f = add_output_format_meta(f, format_id)
                 formats.append(f)
 
+        def extract_subtitles(track_list):
+            if not isinstance(track_list, list):
+                return
+            for track in track_list:
+                if not isinstance(track, dict):
+                    continue
+                if track.get('kind') != 'captions':
+                    continue
+                src = track.get('src')
+                if not src or not isinstance(src, compat_str):
+                    continue
+                lang = track.get('language') or track.get(
+                    'srclang') or track.get('label')
+                sub_dict = automatic_captions if track.get(
+                    'autogenerated') is True else subtitles
+                sub_dict.setdefault(lang, []).append({
+                    'url': src,
+                })
+
         download_urls = asset.get('download_urls')
         if isinstance(download_urls, dict):
             extract_formats(download_urls.get('Video'))
@@ -311,23 +335,16 @@ class UdemyIE(InfoExtractor):
                 extract_formats(data.get('sources'))
                 if not duration:
                     duration = int_or_none(data.get('duration'))
-                tracks = data.get('tracks')
-                if isinstance(tracks, list):
-                    for track in tracks:
-                        if not isinstance(track, dict):
-                            continue
-                        if track.get('kind') != 'captions':
-                            continue
-                        src = track.get('src')
-                        if not src or not isinstance(src, compat_str):
-                            continue
-                        lang = track.get('language') or track.get(
-                            'srclang') or track.get('label')
-                        sub_dict = automatic_captions if track.get(
-                            'autogenerated') is True else subtitles
-                        sub_dict.setdefault(lang, []).append({
-                            'url': src,
-                        })
+                extract_subtitles(data.get('tracks'))
+
+            if not subtitles and not automatic_captions:
+                text_tracks = self._parse_json(
+                    self._search_regex(
+                        r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
+                        'text tracks', default='{}', group='data'), video_id,
+                    transform_source=lambda s: js_to_json(unescapeHTML(s)),
+                    fatal=False)
+                extract_subtitles(text_tracks)
 
         self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
 
diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py
new file mode 100644 (file)
index 0000000..30297b4
--- /dev/null
@@ -0,0 +1,176 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .wistia import WistiaIE
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    get_element_by_class,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class UpskillBaseIE(InfoExtractor):
+    _LOGIN_URL = 'http://upskillcourses.com/sign_in'
+    _NETRC_MACHINE = 'upskill'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page, urlh = self._download_webpage_handle(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        login_url = compat_str(urlh.geturl())
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'user[email]': username,
+            'user[password]': password,
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
+            'post url', default=login_url, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = urljoin(login_url, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in',
+            data=urlencode_postdata(login_form),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': login_url,
+            })
+
+        # Successful login
+        if any(re.search(p, response) for p in (
+                r'class=["\']user-signout',
+                r'<a[^>]+\bhref=["\']/sign_out',
+                r'>\s*Log out\s*<')):
+            return
+
+        message = get_element_by_class('alert', response)
+        if message is not None:
+            raise ExtractorError(
+                'Unable to login: %s' % clean_html(message), expected=True)
+
+        raise ExtractorError('Unable to log in')
+
+
+class UpskillIE(UpskillBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+        'info_dict': {
+            'id': 'uzw6zw58or',
+            'ext': 'mp4',
+            'title': 'Welcome to the Course!',
+            'description': 'md5:8d66c13403783370af62ca97a7357bdd',
+            'duration': 138.763,
+            'timestamp': 1479846621,
+            'upload_date': '20161122',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        wistia_url = WistiaIE._extract_url(webpage)
+        if not wistia_url:
+            if any(re.search(p, webpage) for p in (
+                    r'class=["\']lecture-contents-locked',
+                    r'>\s*Lecture contents locked',
+                    r'id=["\']lecture-locked')):
+                self.raise_login_required('Lecture contents locked')
+
+        title = self._og_search_title(webpage, default=None)
+
+        return {
+            '_type': 'url_transparent',
+            'url': wistia_url,
+            'ie_key': WistiaIE.ie_key(),
+            'title': title,
+        }
+
+
+class UpskillCourseIE(UpskillBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
+        'info_dict': {
+            'id': '119763',
+            'title': 'The Essential Web Developer Course (Free)',
+        },
+        'playlist_count': 192,
+    }, {
+        'url': 'http://upskillcourses.com/courses/119763/',
+        'only_matching': True,
+    }, {
+        'url': 'http://upskillcourses.com/courses/enrolled/119763',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if UpskillIE.suitable(url) else super(
+            UpskillCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_id)
+
+        course_id = self._search_regex(
+            r'data-course-id=["\'](\d+)', webpage, 'course id',
+            default=course_id)
+
+        entries = []
+
+        for mobj in re.finditer(
+                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
+                webpage):
+            li = mobj.group('li')
+            if 'fa-youtube-play' not in li:
+                continue
+            lecture_url = self._search_regex(
+                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
+                'lecture url', default=None, group='url')
+            if not lecture_url:
+                continue
+            lecture_id = self._search_regex(
+                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
+            title = self._html_search_regex(
+                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
+                'title', default=None)
+            entries.append(
+                self.url_result(
+                    urljoin('http://upskillcourses.com/', lecture_url),
+                    ie=UpskillIE.ie_key(), video_id=lecture_id,
+                    video_title=clean_html(title)))
+
+        course_title = self._html_search_regex(
+            (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
+             r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
+            webpage, 'course title', fatal=False)
+
+        return self.playlist_result(entries, course_id, course_title)
index 0f5d6873808ed2dce5cde2e6239b6973cf809367..b20dddc5c4eb1307e02da56efc6bff35c603933c 100644 (file)
@@ -12,47 +12,46 @@ from ..utils import (
 
 
 class VeohIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+    _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
 
-    _TESTS = [
-        {
-            'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
-            'md5': '620e68e6a3cff80086df3348426c9ca3',
-            'info_dict': {
-                'id': '56314296',
-                'ext': 'mp4',
-                'title': 'Straight Backs Are Stronger',
-                'uploader': 'LUMOback',
-                'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
-            },
+    _TESTS = [{
+        'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+        'md5': '620e68e6a3cff80086df3348426c9ca3',
+        'info_dict': {
+            'id': '56314296',
+            'ext': 'mp4',
+            'title': 'Straight Backs Are Stronger',
+            'uploader': 'LUMOback',
+            'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
         },
-        {
-            'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
-            'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
-            'info_dict': {
-                'id': '27701988',
-                'ext': 'mp4',
-                'title': 'Chile workers cover up to avoid skin damage',
-                'description': 'md5:2bd151625a60a32822873efc246ba20d',
-                'uploader': 'afp-news',
-                'duration': 123,
-            },
-            'skip': 'This video has been deleted.',
+    }, {
+        'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+        'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+        'info_dict': {
+            'id': '27701988',
+            'ext': 'mp4',
+            'title': 'Chile workers cover up to avoid skin damage',
+            'description': 'md5:2bd151625a60a32822873efc246ba20d',
+            'uploader': 'afp-news',
+            'duration': 123,
         },
-        {
-            'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
-            'md5': '4fde7b9e33577bab2f2f8f260e30e979',
-            'note': 'Embedded ooyala video',
-            'info_dict': {
-                'id': '69525809',
-                'ext': 'mp4',
-                'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
-                'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
-                'uploader': 'newsy-videos',
-            },
-            'skip': 'This video has been deleted.',
+        'skip': 'This video has been deleted.',
+    }, {
+        'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+        'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+        'note': 'Embedded ooyala video',
+        'info_dict': {
+            'id': '69525809',
+            'ext': 'mp4',
+            'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+            'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+            'uploader': 'newsy-videos',
         },
-    ]
+        'skip': 'This video has been deleted.',
+    }, {
+        'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
+        'only_matching': True,
+    }]
 
     def _extract_formats(self, source):
         formats = []
index 9aa38bc5a725068125098e6c76c2d6bdaa6b7e00..890a149ead3234428a3f33d78d8af43365a278e2 100644 (file)
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 import re
+import json
 
 from .common import InfoExtractor
 from ..compat import (
@@ -11,7 +12,6 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     int_or_none,
-    sanitized_Request,
     parse_iso8601,
 )
 
@@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):
     }
 
     def _initialize_api(self, video_id):
-        req = sanitized_Request(
-            'http://www.vevo.com/auth', data=b'')
         webpage = self._download_webpage(
-            req, None,
+            'https://accounts.vevo.com/token', None,
             note='Retrieving oauth token',
-            errnote='Unable to retrieve oauth token')
+            errnote='Unable to retrieve oauth token',
+            data=json.dumps({
+                'client_id': 'SPupX1tvqFEopQ1YS6SS',
+                'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
 
         if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
             self.raise_geo_restricted(
                 '%s said: This page is currently unavailable in your region' % self.IE_NAME)
 
         auth_info = self._parse_json(webpage, video_id)
-        self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
+        self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
 
     def _call_api(self, path, *args, **kwargs):
         try:
index 6be3774b7aa7367e9a067b417d7a749fece55d05..570fa45ea7d7e492ac924882f04708d4ecbf5c59 100644 (file)
@@ -121,7 +121,11 @@ class VH1IE(MTVIE):
         idoc = self._download_xml(
             doc_url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
-        return self.playlist_result(
-            [self._get_video_info(item) for item in idoc.findall('.//item')],
-            playlist_id=video_id,
-        )
+
+        entries = []
+        for item in idoc.findall('.//item'):
+            info = self._get_video_info(item)
+            if info:
+                entries.append(info)
+
+        return self.playlist_result(entries, playlist_id=video_id)
index f0a7fd7397bd81670f993dbeb18553eb011bbd2a..b8b8bf97968ea430a445c521622461736346af7b 100644 (file)
@@ -7,6 +7,7 @@ import hashlib
 import json
 
 from .adobepass import AdobePassIE
+from .youtube import YoutubeIE
 from .common import InfoExtractor
 from ..compat import compat_HTTPError
 from ..utils import (
@@ -20,7 +21,7 @@ from ..utils import (
 
 
 class ViceBaseIE(AdobePassIE):
-    def _extract_preplay_video(self, url, webpage):
+    def _extract_preplay_video(self, url, locale, webpage):
         watch_hub_data = extract_attributes(self._search_regex(
             r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
         video_id = watch_hub_data['vms-id']
@@ -32,7 +33,8 @@ class ViceBaseIE(AdobePassIE):
             resource = self._get_mvpd_resource(
                 'VICELAND', title, video_id,
                 watch_hub_data.get('video-rating'))
-            query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource)
+            query['tvetoken'] = self._extract_mvpd_auth(
+                url, video_id, 'VICELAND', resource)
 
         # signature generation algorithm is reverse engineered from signatureGenerator in
         # webpack:///../shared/~/vice-player/dist/js/vice-player.js in
@@ -45,11 +47,14 @@ class ViceBaseIE(AdobePassIE):
 
         try:
             host = 'www.viceland' if is_locked else self._PREPLAY_HOST
-            preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query)
+            preplay = self._download_json(
+                'https://%s.com/%s/preplay/%s' % (host, locale, video_id),
+                video_id, query=query)
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
                 error = json.loads(e.cause.read().decode())
-                raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True)
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, error['details']), expected=True)
             raise
 
         video_data = preplay['video']
@@ -88,41 +93,30 @@ class ViceBaseIE(AdobePassIE):
 
 
 class ViceIE(ViceBaseIE):
-    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+    IE_NAME = 'vice'
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'
 
     _TESTS = [{
-        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
-        'md5': 'e9d77741f9e42ba583e683cd170660f7',
+        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+        'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
         'info_dict': {
-            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+            'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',
             'ext': 'flv',
-            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-            'duration': 725.983,
+            'title': 'Monkey Labs of Holland',
+            'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',
         },
         'add_ie': ['Ooyala'],
-    }, {
-        'url': 'http://www.vice.com/video/how-to-hack-a-car',
-        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
-        'info_dict': {
-            'id': '3jstaBeXgAs',
-            'ext': 'mp4',
-            'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
-            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
-            'uploader_id': 'MotherboardTV',
-            'uploader': 'Motherboard',
-            'upload_date': '20140529',
-        },
-        'add_ie': ['Youtube'],
     }, {
         'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
-        'md5': '',
         'info_dict': {
             'id': '5816510690b70e6c5fd39a56',
             'ext': 'mp4',
             'uploader': 'Waypoint',
             'title': 'The Signal From Tölva',
+            'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
             'uploader_id': '57f7d621e05ca860fa9ccaf9',
-            'timestamp': 1477941983938,
+            'timestamp': 1477941983,
+            'upload_date': '20161031',
         },
         'params': {
             # m3u8 download
@@ -130,19 +124,31 @@ class ViceIE(ViceBaseIE):
         },
         'add_ie': ['UplynkPreplay'],
     }, {
-        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
-        'only_matching': True,
+        'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
+        'info_dict': {
+            'id': '581b12b60a0e1f4c0fb6ea2f',
+            'ext': 'mp4',
+            'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
+            'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
+            'uploader': 'VICE',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1485368119,
+            'upload_date': '20170125',
+            'age_limit': 14,
+        },
+        'params': {
+            # AES-encrypted m3u8
+            'skip_download': True,
+        },
+        'add_ie': ['UplynkPreplay'],
     }, {
-        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+        'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
         'only_matching': True,
     }]
     _PREPLAY_HOST = 'video.vice'
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        locale, video_id = re.match(self._VALID_URL, url).groups()
         webpage, urlh = self._download_webpage_handle(url, video_id)
         embed_code = self._search_regex(
             r'embedCode=([^&\'"]+)', webpage,
@@ -153,10 +159,11 @@ class ViceIE(ViceBaseIE):
             r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)
         if youtube_id:
             return self.url_result(youtube_id, 'Youtube')
-        return self._extract_preplay_video(urlh.geturl(), webpage)
+        return self._extract_preplay_video(urlh.geturl(), locale, webpage)
 
 
 class ViceShowIE(InfoExtractor):
+    IE_NAME = 'vice:show'
     _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
 
     _TEST = {
@@ -183,6 +190,84 @@ class ViceShowIE(InfoExtractor):
             r'<title>(.+?)</title>', webpage, 'title', default=None)
         if title:
             title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
-        description = self._html_search_meta('description', webpage, 'description')
+        description = self._html_search_meta(
+            'description', webpage, 'description')
 
         return self.playlist_result(entries, show_id, title, description)
+
+
+class ViceArticleIE(InfoExtractor):
+    IE_NAME = 'vice:article'
+    _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)'
+
+    _TESTS = [{
+        'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
+        'info_dict': {
+            'id': '58dc0a3dee202d2a0ccfcbd8',
+            'ext': 'mp4',
+            'title': 'Mormon War on Porn ',
+            'description': 'md5:ad396a2481e7f8afb5ed486878421090',
+            'uploader': 'VICE',
+            'uploader_id': '57a204088cb727dec794c693',
+            'timestamp': 1489160690,
+            'upload_date': '20170310',
+        },
+        'params': {
+            # AES-encrypted m3u8
+            'skip_download': True,
+        },
+        'add_ie': ['UplynkPreplay'],
+    }, {
+        'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
+        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
+        'info_dict': {
+            'id': '3jstaBeXgAs',
+            'ext': 'mp4',
+            'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
+            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
+            'uploader_id': 'MotherboardTV',
+            'uploader': 'Motherboard',
+            'upload_date': '20140529',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        prefetch_data = self._parse_json(self._search_regex(
+            r'window\.__PREFETCH_DATA\s*=\s*({.*});',
+            webpage, 'prefetch data'), display_id)
+        body = prefetch_data['body']
+
+        def _url_res(video_url, ie_key):
+            return {
+                '_type': 'url_transparent',
+                'url': video_url,
+                'display_id': display_id,
+                'ie_key': ie_key,
+            }
+
+        embed_code = self._search_regex(
+            r'embedCode=([^&\'"]+)', body,
+            'ooyala embed code', default=None)
+        if embed_code:
+            return _url_res('ooyala:%s' % embed_code, 'Ooyala')
+
+        youtube_url = YoutubeIE._extract_url(body)
+        if youtube_url:
+            return _url_res(youtube_url, YoutubeIE.ie_key())
+
+        video_url = self._html_search_regex(
+            r'data-video-url="([^"]+)"',
+            prefetch_data['embed_code'], 'video URL')
+
+        return _url_res(video_url, ViceIE.ie_key())
index 87f9216b5da6965cf5b9aa163040ac42ce7baae4..bd60235c8845ac3c8662d4023478c06cd3ecb71b 100644 (file)
@@ -1,11 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .vice import ViceBaseIE
 
 
 class VicelandIE(ViceBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
     _TEST = {
         'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
         'info_dict': {
@@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE):
             'skip_download': True,
         },
         'add_ie': ['UplynkPreplay'],
+        'skip': '404',
     }
     _PREPLAY_HOST = 'www.viceland'
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        locale = mobj.group('locale')
         webpage = self._download_webpage(url, video_id)
-        return self._extract_preplay_video(url, webpage)
+        return self._extract_preplay_video(url, locale, webpage)
index 049db25a591f72597dd3734def6b8256ac126864..e5f964d39abaa06ea0400bb68214a5ed7cab42f2 100644 (file)
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import random
 import re
 
 from .common import InfoExtractor
@@ -11,6 +10,7 @@ from ..utils import (
     float_or_none,
     parse_age_limit,
     qualities,
+    random_birthday,
     try_get,
     unified_timestamp,
     urljoin,
@@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
+        query = random_birthday('birth_year', 'birth_month', 'birth_day')
         video = self._download_json(
             'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
-            video_id, query={
-                'birth_month': random.randint(1, 12),
-                'birth_day': random.randint(1, 31),
-                'birth_year': random.randint(1950, 1995),
-            })
+            video_id, query=query)
 
         title = video['title']
 
index 701bb1d01c646b75091b769701566d92853ec365..01da32f1cdd05505a6d53eff4f9bc6691aaae512 100644 (file)
@@ -56,7 +56,8 @@ class VidioIE(InfoExtractor):
         self._sort_formats(formats)
 
         duration = int_or_none(duration or self._search_regex(
-            r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration'))
+            r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
+            'duration', fatal=False, group='duration'))
         thumbnail = thumbnail or self._og_search_thumbnail(webpage)
 
         like_count = int_or_none(self._search_regex(
index e9ff336c4f5cb2e5a4b08fe5a97aa9993bdf87e0..59adb2377e06c95bb8681432a0411723b7879945 100644 (file)
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import itertools
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor):
                 'or for violating the terms of use.',
                 expected=True)
 
-        formats = [{
-            'format_id': f.get('type'),
-            'url': f['uri'],
-            'width': int_or_none(f.get('width')),
-            'height': int_or_none(f.get('height')),
-            'preference': 0 if f.get('type', '').endswith('clip') else 1,
-        } for f in video.get('formats', []) if f.get('uri')]
+        formats = []
+        for f in video.get('formats', []):
+            format_url = f.get('uri')
+            if not format_url or not isinstance(format_url, compat_str):
+                continue
+            format_type = f.get('type')
+            if format_type == 'dash':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+            elif format_type == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'format_id': f.get('type'),
+                    'url': format_url,
+                    'width': int_or_none(f.get('width')),
+                    'height': int_or_none(f.get('height')),
+                    'preference': 0 if f.get('type', '').endswith(
+                        'clip') else 1,
+                })
 
         if not formats and video.get('complete_url'):
             formats.append({
@@ -245,29 +263,35 @@ class VidmeListBaseIE(InfoExtractor):
 
 class VidmeUserIE(VidmeListBaseIE):
     IE_NAME = 'vidme:user'
-    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)'
     _API_ITEM = 'list'
     _TITLE = 'Videos'
-    _TEST = {
-        'url': 'https://vid.me/EFARCHIVE',
+    _TESTS = [{
+        'url': 'https://vid.me/MasakoX',
         'info_dict': {
-            'id': '3834632',
-            'title': 'EFARCHIVE - %s' % _TITLE,
+            'id': '16112341',
+            'title': 'MasakoX - %s' % _TITLE,
         },
-        'playlist_mincount': 238,
-    }
+        'playlist_mincount': 191,
+    }, {
+        'url': 'https://vid.me/unsQuare_netWork',
+        'only_matching': True,
+    }]
 
 
 class VidmeUserLikesIE(VidmeListBaseIE):
     IE_NAME = 'vidme:user:likes'
-    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes'
     _API_ITEM = 'likes'
     _TITLE = 'Likes'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://vid.me/ErinAlexis/likes',
         'info_dict': {
             'id': '6483530',
             'title': 'ErinAlexis - %s' % _TITLE,
         },
         'playlist_mincount': 415,
-    }
+    }, {
+        'url': 'https://vid.me/Kaleidoscope-Ish/likes',
+        'only_matching': True,
+    }]
index 5ef7635b6e6a9e82e76c9019361051527c9a3f53..dbd5ba9bad82e3643a462a7db42fce9714dc54bb 100644 (file)
@@ -5,24 +5,44 @@ import re
 import itertools
 
 from .common import InfoExtractor
+from ..utils import (
+    urlencode_postdata,
+    int_or_none,
+    unified_strdate,
+)
 
 
 class VierIE(InfoExtractor):
     IE_NAME = 'vier'
     IE_DESC = 'vier.be and vijf.be'
-    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?(?P<site>vier|vijf)\.be/
+                        (?:
+                            (?:
+                                [^/]+/videos|
+                                video(?:/[^/]+)*
+                            )/
+                            (?P<display_id>[^/]+)(?:/(?P<id>\d+))?|
+                            (?:
+                                video/v3/embed|
+                                embed/video/public
+                            )/(?P<embed_id>\d+)
+                        )
+                    '''
+    _NETRC_MACHINE = 'vier'
     _TESTS = [{
         'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+        'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
         'info_dict': {
             'id': '16129',
             'display_id': 'het-wordt-warm-de-moestuin',
             'ext': 'mp4',
             'title': 'Het wordt warm in De Moestuin',
             'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
+            'upload_date': '20121025',
+            'series': 'Plan B',
+            'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],
         },
     }, {
         'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
@@ -30,46 +50,145 @@ class VierIE(InfoExtractor):
             'id': '2561614',
             'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
             'ext': 'mp4',
-            'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
-            'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
+            'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
+            'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
+            'upload_date': '20170228',
+            'series': 'Temptation Island',
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+        'info_dict': {
+            'id': '2674839',
+            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+            'ext': 'mp4',
+            'title': 'Jani gaat naar Tokio - Aflevering 4',
+            'description': 'md5:aa8d611541db6ae9e863125704511f88',
+            'upload_date': '20170501',
+            'series': 'Jani gaat',
+            'episode_number': 4,
+            'tags': ['Jani Gaat', 'Volledige Aflevering'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        # Requires account credentials but bypassed extraction via v3/embed page
+        # without metadata
+        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+        'info_dict': {
+            'id': '2674839',
+            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+            'ext': 'mp4',
+            'title': 'jani-gaat-naar-tokio-aflevering-4',
         },
         'params': {
-            # m3u8 download
             'skip_download': True,
         },
+        'expected_warnings': ['Log in to extract metadata'],
     }, {
-        'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+        # Without video id in URL
+        'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
         'only_matching': True,
     }, {
         'url': 'http://www.vier.be/video/v3/embed/16129',
         'only_matching': True,
+    }, {
+        'url': 'https://www.vijf.be/embed/video/public/4093',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6',
+        'only_matching': True,
     }]
 
+    def _real_initialize(self):
+        self._logged_in = False
+
+    def _login(self, site):
+        username, password = self._get_login_info()
+        if username is None or password is None:
+            return
+
+        login_page = self._download_webpage(
+            'http://www.%s.be/user/login' % site,
+            None, note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata({
+                'form_id': 'user_login',
+                'name': username,
+                'pass': password,
+            }),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        login_error = self._html_search_regex(
+            r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
+            login_page, 'login error', default=None)
+        if login_error:
+            self.report_warning('Unable to log in: %s' % login_error)
+        else:
+            self._logged_in = True
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         embed_id = mobj.group('embed_id')
         display_id = mobj.group('display_id') or embed_id
+        video_id = mobj.group('id') or embed_id
         site = mobj.group('site')
 
+        if not self._logged_in:
+            self._login(site)
+
         webpage = self._download_webpage(url, display_id)
 
+        if r'id="user-login"' in webpage:
+            self.report_warning(
+                'Log in to extract metadata', video_id=display_id)
+            webpage = self._download_webpage(
+                'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
+                display_id)
+
         video_id = self._search_regex(
             [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
-            webpage, 'video id')
-        application = self._search_regex(
-            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
-            webpage, 'application', default=site + '_vod')
-        filename = self._search_regex(
-            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
-            webpage, 'filename')
-
-        playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
-        formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
+            webpage, 'video id', default=video_id or display_id)
+
+        playlist_url = self._search_regex(
+            r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1',
+            webpage, 'm3u8 url', default=None, group='url')
+
+        if not playlist_url:
+            application = self._search_regex(
+                [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+                webpage, 'application', default=site + '_vod')
+            filename = self._search_regex(
+                [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+                webpage, 'filename')
+            playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
+
+        formats = self._extract_wowza_formats(
+            playlist_url, display_id, skip_protocols=['dash'])
         self._sort_formats(formats)
 
         title = self._og_search_title(webpage, default=display_id)
-        description = self._og_search_description(webpage, default=None)
+        description = self._html_search_regex(
+            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>',
+            webpage, 'description', default=None, group='value')
         thumbnail = self._og_search_thumbnail(webpage, default=None)
+        upload_date = unified_strdate(self._html_search_regex(
+            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})',
+            webpage, 'upload date', default=None, group='value'))
+
+        series = self._search_regex(
+            r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+            'series', default=None, group='value')
+        episode_number = int_or_none(self._search_regex(
+            r'(?i)aflevering (\d+)', title, 'episode number', default=None))
+        tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)
 
         return {
             'id': video_id,
@@ -77,6 +196,10 @@ class VierIE(InfoExtractor):
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'series': series,
+            'episode_number': episode_number,
+            'tags': tags,
             'formats': formats,
         }
 
index 4adcd183030438762f4a82be90adb171e1a38d34..a0abbae60fee59fda14d8cf7ac5a235a3df7472d 100644 (file)
@@ -4,12 +4,14 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urlparse,
+    compat_HTTPError,
     compat_str,
+    compat_urlparse,
 )
 from ..utils import (
-    parse_duration,
+    ExtractorError,
     js_to_json,
+    parse_duration,
     parse_iso8601,
 )
 
@@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor):
 
         base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
 
-        lecture_data = self._download_json(
-            '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
-            lecture_id)['lecture'][0]
+        try:
+            lecture_data = self._download_json(
+                '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
+                lecture_id)['lecture'][0]
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                msg = self._parse_json(
+                    e.cause.read().decode('utf-8'), lecture_id)
+                raise ExtractorError(msg['detail'], expected=True)
+            raise
 
         lecture_info = {
             'id': lecture_id,
index 61cc469bf27b58bfc70eb8bd036737ec0a4cb66c..c3f71b45e3bbee17bbf7e1d4787ac0aa888f251b 100644 (file)
@@ -151,10 +151,16 @@ class VimeoBaseInfoExtractor(InfoExtractor):
                     else:
                         mpd_manifest_urls = [(format_id, manifest_url)]
                     for f_id, m_url in mpd_manifest_urls:
-                        formats.extend(self._extract_mpd_formats(
+                        mpd_formats = self._extract_mpd_formats(
                             m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
                             'Downloading %s MPD information' % cdn_name,
-                            fatal=False))
+                            fatal=False)
+                        for f in mpd_formats:
+                            if f.get('vcodec') == 'none':
+                                f['preference'] = -50
+                            elif f.get('acodec') == 'none':
+                                f['preference'] = -40
+                        formats.extend(mpd_formats)
 
         subtitles = {}
         text_tracks = config['request'].get('text_tracks')
@@ -609,7 +615,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
                     source_name = source_file.get('public_name', 'Original')
                     if self._is_valid_url(download_url, video_id, '%s video' % source_name):
-                        ext = source_file.get('extension', determine_ext(download_url)).lower()
+                        ext = (try_get(
+                            source_file, lambda x: x['extension'],
+                            compat_str) or determine_ext(
+                            download_url, None) or 'mp4').lower()
                         formats.append({
                             'url': download_url,
                             'ext': ext,
index 4957a07f7bde0797f1f8b6f4c4ef9297f8e915e3..46950d3a1499b20bec50a9482d47a0ebdbbfb8b3 100644 (file)
@@ -92,10 +92,12 @@ class VineIE(InfoExtractor):
 
         username = data.get('username')
 
+        alt_title = 'Vine by %s' % username if username else None
+
         return {
             'id': video_id,
-            'title': data.get('description'),
-            'alt_title': 'Vine by %s' % username if username else None,
+            'title': data.get('description') or alt_title or 'Vine video',
+            'alt_title': alt_title,
             'thumbnail': data.get('thumbnailUrl'),
             'timestamp': unified_timestamp(data.get('created')),
             'uploader': username,
index db6a65d2ed93b233b050561c56fa7b6133ddb1b7..5cf93591cd0d863bb1f62968a0c25ea86e4cc744 100644 (file)
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_kwargs,
+    compat_str,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -36,7 +39,8 @@ class ViuBaseIE(InfoExtractor):
         headers.update(kwargs.get('headers', {}))
         kwargs['headers'] = headers
         response = self._download_json(
-            'https://www.viu.com/api/' + path, *args, **kwargs)['response']
+            'https://www.viu.com/api/' + path, *args,
+            **compat_kwargs(kwargs))['response']
         if response.get('status') != 'success':
             raise ExtractorError('%s said: %s' % (
                 self.IE_NAME, response['message']), expected=True)
index dc2719cf987981999a31487966508e589c86f64b..105e172d539d4376e4534df31266c489f3d62518 100644 (file)
@@ -25,6 +25,7 @@ from ..utils import (
 from .dailymotion import DailymotionIE
 from .pladform import PladformIE
 from .vimeo import VimeoIE
+from .youtube import YoutubeIE
 
 
 class VKBaseIE(InfoExtractor):
@@ -345,11 +346,9 @@ class VKIE(VKBaseIE):
             if re.search(error_re, info_page):
                 raise ExtractorError(error_msg % video_id, expected=True)
 
-        youtube_url = self._search_regex(
-            r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
-            info_page, 'youtube iframe', default=None)
+        youtube_url = YoutubeIE._extract_url(info_page)
         if youtube_url:
-            return self.url_result(youtube_url, 'Youtube')
+            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
 
         vimeo_url = VimeoIE._extract_url(url, info_page)
         if vimeo_url is not None:
index e58940607a491581bb6e263ca2ffd3dd9741feac..64d0224e6f92779c68e2170564c2b1ad459269ed 100644 (file)
@@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor):
         },
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -232,7 +236,12 @@ class VLiveChannelIE(InfoExtractor):
                 query={
                     'app_id': app_id,
                     'channelSeq': channel_seq,
-                    'maxNumOfRows': 1000,
+                    # Large values of maxNumOfRows (~300 or above) may cause
+                    # empty responses (see [1]), e.g. this happens for [2] that
+                    # has more than 300 videos.
+                    # 1. https://github.com/rg3/youtube-dl/issues/13830
+                    # 2. http://channels.vlive.tv/EDBF.
+                    'maxNumOfRows': 100,
                     '_': int(time.time()),
                     'pageNo': page_num
                 }
@@ -261,3 +270,54 @@ class VLiveChannelIE(InfoExtractor):
 
         return self.playlist_result(
             entries, channel_code, channel_name)
+
+
+class VLivePlaylistIE(InfoExtractor):
+    IE_NAME = 'vlive:playlist'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.vlive.tv/video/22867/playlist/22912',
+        'info_dict': {
+            'id': '22912',
+            'title': 'Valentine Day Message from TWICE'
+        },
+        'playlist_mincount': 9
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, playlist_id = mobj.group('video_id', 'id')
+
+        VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_id)
+            return self.url_result(
+                VIDEO_URL_TEMPLATE % video_id,
+                ie=VLiveIE.ie_key(), video_id=video_id)
+
+        self.to_screen(
+            'Downloading playlist %s - add --no-playlist to just download video'
+            % playlist_id)
+
+        webpage = self._download_webpage(
+            'http://www.vlive.tv/video/%s/playlist/%s'
+            % (video_id, playlist_id), playlist_id)
+
+        item_ids = self._parse_json(
+            self._search_regex(
+                r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
+                'playlist video seqs'),
+            playlist_id)
+
+        entries = [
+            self.url_result(
+                VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
+                video_id=compat_str(item_id))
+            for item_id in item_ids]
+
+        playlist_name = self._html_search_regex(
+            r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
+            webpage, 'playlist title', fatal=False)
+
+        return self.playlist_result(entries, playlist_id, playlist_name)
diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py
new file mode 100644 (file)
index 0000000..5de3deb
--- /dev/null
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class VootIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)'
+    _GEO_COUNTRIES = ['IN']
+    _TESTS = [{
+        'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
+        'info_dict': {
+            'id': '0_8ledb18o',
+            'ext': 'mp4',
+            'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',
+            'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
+            'uploader_id': 'batchUser',
+            'timestamp': 1472162937,
+            'upload_date': '20160825',
+            'duration': 1146,
+            'series': 'Ishq Ka Rang Safed',
+            'season_number': 1,
+            'episode': 'Is this the end of Kamini?',
+            'episode_number': 340,
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.voot.com/movies/pandavas-5/424627',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        media_info = self._download_json(
+            'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
+            query={
+                'platform': 'Web',
+                'pId': 2,
+                'mediaId': video_id,
+            })
+
+        status_code = try_get(media_info, lambda x: x['status']['code'], int)
+        if status_code != 0:
+            raise ExtractorError(media_info['status']['message'], expected=True)
+
+        media = media_info['assets']
+
+        entry_id = media['EntryId']
+        title = media['MediaName']
+
+        description, series, season_number, episode, episode_number = [None] * 5
+
+        for meta in try_get(media, lambda x: x['Metas'], list) or []:
+            key, value = meta.get('Key'), meta.get('Value')
+            if not key or not value:
+                continue
+            if key == 'ContentSynopsis':
+                description = value
+            elif key == 'RefSeriesTitle':
+                series = value
+            elif key == 'RefSeriesSeason':
+                season_number = int_or_none(value)
+            elif key == 'EpisodeMainTitle':
+                episode = value
+            elif key == 'EpisodeNo':
+                episode_number = int_or_none(value)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'kaltura:1982551:%s' % entry_id,
+            'ie_key': KalturaIE.ie_key(),
+            'title': title,
+            'description': description,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'timestamp': unified_timestamp(media.get('CreationDate')),
+            'duration': int_or_none(media.get('Duration')),
+            'view_count': int_or_none(media.get('ViewCounter')),
+            'like_count': int_or_none(media.get('like_counter')),
+        }
index 487047fd78c65a758ccfa04fd50c16bfddda2623..9959627c0ad3690e02594bc10b9633c023604f7e 100644 (file)
@@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE):
 
         audio_locale = streams_json.get('audio_locale')
         formats = []
-        for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items():
-            stream_url = stream.get('url')
-            if not stream_url:
-                continue
-            stream_id = stream_id or audio_locale
-            m3u8_formats = self._extract_m3u8_formats(
-                stream_url, video_id, 'mp4', m3u8_id=stream_id,
-                note='Downloading %s m3u8 information' % stream_id,
-                fatal=False)
-            if audio_locale:
-                for f in m3u8_formats:
-                    f['language'] = audio_locale
-            formats.extend(m3u8_formats)
+        for stream_type, streams in streams_json.get('streams', {}).items():
+            if stream_type in ('adaptive_hls', 'adaptive_dash'):
+                for stream in streams.values():
+                    stream_url = stream.get('url')
+                    if not stream_url:
+                        continue
+                    stream_id = stream.get('hardsub_locale') or audio_locale
+                    format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
+                    if stream_type == 'adaptive_hls':
+                        adaptive_formats = self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', m3u8_id=format_id,
+                            note='Downloading %s m3u8 information' % stream_id,
+                            fatal=False)
+                    else:
+                        adaptive_formats = self._extract_mpd_formats(
+                            stream_url, video_id, mpd_id=format_id,
+                            note='Downloading %s MPD information' % stream_id,
+                            fatal=False)
+                    if audio_locale:
+                        for f in adaptive_formats:
+                            if f.get('acodec') != 'none':
+                                f['language'] = audio_locale
+                    formats.extend(adaptive_formats)
         self._sort_formats(formats)
 
+        subtitles = {}
+        for subtitle in streams_json.get('subtitles', {}).values():
+            subtitle_url = subtitle.get('url')
+            if not subtitle_url:
+                continue
+            subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
+                'url': subtitle_url,
+                'ext': subtitle.get('format', 'ass'),
+            })
+
         thumbnails = []
         for thumbnail in video_data.get('images', {}).get('thumbnails', []):
             thumbnail_url = thumbnail.get('source')
@@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE):
             'id': video_id,
             'title': title,
             'formats': formats,
+            'subtitles': subtitles,
             'thumbnails': thumbnails,
             'description': video_data.get('description'),
             'duration': float_or_none(video_data.get('duration_ms'), 1000),
index b270f08d1e8796889ac54e4885c2234e4e7eb46b..02fcd52c74c268ea8274359af46810a6beeb0636 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor):
         },
     }]
 
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)',
+            webpage)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_data = self._download_json(
index 839cad986cbbf4edc8f73ca5639e780f210163c2..625d0a1cc14a52604f46e264a0f93342056fd9df 100644 (file)
@@ -13,6 +13,7 @@ from ..utils import (
 class WashingtonPostIE(InfoExtractor):
     IE_NAME = 'washingtonpost'
     _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
     _TEST = {
         'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
         'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
         },
     }
 
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_data = self._download_json(
diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py
new file mode 100644 (file)
index 0000000..b382338
--- /dev/null
@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    strip_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class WatchBoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)'
+    _TESTS = [{
+        # film
+        'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html',
+        'info_dict': {
+            'id': '341368',
+            'ext': 'mp4',
+            'title': 'Free Jimmy',
+            'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 4890,
+            'age_limit': 16,
+            'release_year': 2009,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        # episode
+        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html',
+        'info_dict': {
+            'id': '328286',
+            'ext': 'mp4',
+            'title': 'S01 E01 - Date in der Hölle',
+            'description': 'md5:2f31c74a8186899f33cb5114491dae2b',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1291,
+            'age_limit': 12,
+            'release_year': 2010,
+            'series': 'Ugly Americans',
+            'season_number': 1,
+            'episode': 'Date in der Hölle',
+            'episode_number': 1,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id = mobj.group('kind', 'id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        source = self._parse_json(
+            self._search_regex(
+                r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False) or {}
+
+        video_id = compat_str(source.get('videoId') or video_id)
+
+        devapi = self._download_json(
+            'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={
+                'format': 'json',
+                'apikey': 'hbbtv',
+            }, fatal=False)
+
+        item = try_get(devapi, lambda x: x['items'][0], dict) or {}
+
+        title = item.get('title') or try_get(
+            item, lambda x: x['movie']['headline_movie'],
+            compat_str) or source['title']
+
+        formats = []
+        hls_url = item.get('media_videourl_hls') or source.get('hls')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        dash_url = item.get('media_videourl_wv') or source.get('dash')
+        if dash_url:
+            formats.extend(self._extract_mpd_formats(
+                dash_url, video_id, mpd_id='dash', fatal=False))
+        mp4_url = item.get('media_videourl')
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'mp4',
+                'width': int_or_none(item.get('width')),
+                'height': int_or_none(item.get('height')),
+                'tbr': int_or_none(item.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        description = strip_or_none(item.get('descr'))
+        thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail')
+        duration = int_or_none(item.get('media_length') or source.get('length'))
+        timestamp = unified_timestamp(item.get('pubDate'))
+        view_count = int_or_none(item.get('media_views'))
+        age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk']))
+        release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year']))
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'release_year': release_year,
+            'formats': formats,
+        }
+
+        if kind.lower() == 'serien':
+            series = try_get(
+                item, lambda x: x['special']['title'],
+                compat_str) or source.get('format')
+            season_number = int_or_none(self._search_regex(
+                r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number',
+                default=None) or self._search_regex(
+                    r'/staffel-(\d+)/', url, 'season number', default=None))
+            episode = source.get('title')
+            episode_number = int_or_none(self._search_regex(
+                r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number',
+                default=None))
+            info.update({
+                'series': series,
+                'season_number': season_number,
+                'episode': episode,
+                'episode_number': episode_number,
+            })
+
+        return info
index ed099beea632b7eed16e93a7b386b6aec3089778..fadc539eefddc0918ef70fe97367be87edbf09c4 100644 (file)
@@ -4,11 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    unified_strdate,
-    parse_duration,
-    int_or_none,
-)
+from ..utils import parse_duration
 
 
 class WatchIndianPornIE(InfoExtractor):
@@ -23,11 +19,8 @@ class WatchIndianPornIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'uploader': 'LoveJay',
-            'upload_date': '20160428',
             'duration': 226,
             'view_count': int,
-            'comment_count': int,
             'categories': list,
             'age_limit': 18,
         }
@@ -40,51 +33,36 @@ class WatchIndianPornIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        video_url = self._html_search_regex(
-            r"url: escape\('([^']+)'\)", webpage, 'url')
+        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
 
-        title = self._html_search_regex(
-            r'<h2 class="he2"><span>(.*?)</span>',
-            webpage, 'title')
-        thumbnail = self._html_search_regex(
-            r'<span id="container"><img\s+src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
-
-        uploader = self._html_search_regex(
-            r'class="aupa">\s*(.*?)</a>',
-            webpage, 'uploader')
-        upload_date = unified_strdate(self._html_search_regex(
-            r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False))
+        title = self._html_search_regex((
+            r'<title>(.+?)\s*-\s*Indian\s+Porn</title>',
+            r'<h4>(.+?)</h4>'
+        ), webpage, 'title')
 
         duration = parse_duration(self._search_regex(
-            r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>',
+            r'Time:\s*<strong>\s*(.+?)\s*</strong>',
             webpage, 'duration', fatal=False))
 
-        view_count = int_or_none(self._search_regex(
-            r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
+        view_count = int(self._search_regex(
+            r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>',
             webpage, 'view count', fatal=False))
-        comment_count = int_or_none(self._search_regex(
-            r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
-            webpage, 'comment count', fatal=False))
 
         categories = re.findall(
-            r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>',
+            r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>',
             webpage)
 
-        return {
+        info_dict.update({
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
             'http_headers': {
                 'Referer': url,
             },
             'title': title,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'upload_date': upload_date,
             'duration': duration,
             'view_count': view_count,
-            'comment_count': comment_count,
             'categories': categories,
             'age_limit': 18,
-        }
+        })
+
+        return info_dict
index c634b8decddf8fdb15649b05e8f49ad9efc36254..2182d6fd485bf4f1ed6ead7d78451b7c357ddc37 100644 (file)
@@ -1,10 +1,13 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
+    unescapeHTML,
 )
 
 
@@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        match = re.search(
+            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+        if match:
+            return unescapeHTML(match.group('url'))
+
+        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+        if match:
+            return 'wistia:%s' % match.group('id')
+
+        match = re.search(
+            r'''(?sx)
+                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+            ''', webpage)
+        if match:
+            return 'wistia:%s' % match.group('id')
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
index 45cfca7c52e92fa8f4aa8e06e633d5198e3066d3..9b5487710e9d1b73a2e26983bacde9e2e8891761 100644 (file)
@@ -13,7 +13,7 @@ class WSJIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                         (?:
                             https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
-                            https?://(?:www\.)?wsj\.com/video/[^/]+/|
+                            https?://(?:www\.)?(?:wsj|barrons)\.com/video/[^/]+/|
                             wsj:
                         )
                         (?P<id>[a-fA-F0-9-]{36})
@@ -35,6 +35,9 @@ class WSJIE(InfoExtractor):
     }, {
         'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
         'only_matching': True,
+    }, {
+        'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 13f8be6cbe365a0413be0b84d26ccd7c33b68e77..ad747978d2fdb468d387191578e33b6bebf41f14 100644 (file)
@@ -10,7 +10,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     NO_DEFAULT,
-    sanitized_Request,
     urlencode_postdata,
 )
 
@@ -30,6 +29,8 @@ class XFileShareIE(InfoExtractor):
         (r'vidabc\.com', 'Vid ABC'),
         (r'vidbom\.com', 'VidBom'),
         (r'vidlo\.us', 'vidlo'),
+        (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'),
+        (r'fastvideo\.me', 'FastVideo.me'),
     )
 
     IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
@@ -109,6 +110,12 @@ class XFileShareIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.rapidvideo.cool/b667kprndr8w',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html',
+        'only_matching': True
     }]
 
     def _real_extract(self, url):
@@ -130,12 +137,12 @@ class XFileShareIE(InfoExtractor):
             if countdown:
                 self._sleep(countdown, video_id)
 
-            post = urlencode_postdata(fields)
-
-            req = sanitized_Request(url, post)
-            req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
-            webpage = self._download_webpage(req, video_id, 'Downloading video page')
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading video page',
+                data=urlencode_postdata(fields), headers={
+                    'Referer': url,
+                    'Content-type': 'application/x-www-form-urlencoded',
+                })
 
         title = (self._search_regex(
             (r'style="z-index: [0-9]+;">([^<]+)</span>',
@@ -150,7 +157,7 @@ class XFileShareIE(InfoExtractor):
         def extract_formats(default=NO_DEFAULT):
             urls = []
             for regex in (
-                    r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
+                    r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
                     r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
                     r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
                     r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
index 7b67037140653cacdc5740f7c560280335de0a0d..c42b59e51f31a02c8508217c1246d5f0a8c4ca9d 100644 (file)
@@ -3,7 +3,9 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
+    clean_html,
     dict_get,
     ExtractorError,
     int_or_none,
@@ -13,29 +15,41 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:.+?\.)?xhamster\.com/
+                        (?:
+                            movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
+                            videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
+                        )
+                    '''
+
     _TESTS = [{
         'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
         'md5': '8281348b8d3c53d39fffb377d24eac4e',
         'info_dict': {
             'id': '1509445',
+            'display_id': 'femaleagent_shy_beauty_takes_the_bait',
             'ext': 'mp4',
             'title': 'FemaleAgent Shy beauty takes the bait',
             'upload_date': '20121014',
             'uploader': 'Ruseful2011',
             'duration': 893,
             'age_limit': 18,
+            'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy'],
         },
     }, {
         'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
         'info_dict': {
             'id': '2221348',
+            'display_id': 'britney_spears_sexy_booty',
             'ext': 'mp4',
             'title': 'Britney Spears  Sexy Booty',
             'upload_date': '20130914',
             'uploader': 'jojo747400',
             'duration': 200,
             'age_limit': 18,
+            'categories': ['Britney Spears', 'Celebrities', 'HD Videos', 'Sexy', 'Sexy Booty'],
         },
         'params': {
             'skip_download': True,
@@ -51,6 +65,7 @@ class XHamsterIE(InfoExtractor):
             'uploader': 'parejafree',
             'duration': 72,
             'age_limit': 18,
+            'categories': ['Amateur', 'Blowjobs'],
         },
         'params': {
             'skip_download': True,
@@ -62,26 +77,18 @@ class XHamsterIE(InfoExtractor):
         # This video is visible for marcoalfa123456's friends only
         'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
         'only_matching': True,
+    }, {
+        # new URL schema
+        'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        def extract_video_url(webpage, name):
-            return self._search_regex(
-                [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
-                 r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
-                 r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
-                webpage, name, group='mp4')
-
-        def is_hd(webpage):
-            return '<div class=\'icon iconHD\'' in webpage
-
         mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_2')
+        display_id = mobj.group('display_id') or mobj.group('display_id_2')
 
-        video_id = mobj.group('id')
-        seo = mobj.group('seo')
-        proto = mobj.group('proto')
-        mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
-        webpage = self._download_webpage(mrss_url, video_id)
+        webpage = self._download_webpage(url, video_id)
 
         error = self._html_search_regex(
             r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
@@ -95,6 +102,39 @@ class XHamsterIE(InfoExtractor):
              r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
             webpage, 'title')
 
+        formats = []
+        format_urls = set()
+
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
+                default='{}'),
+            video_id, fatal=False)
+        for format_id, format_url in sources.items():
+            if not isinstance(format_url, compat_str):
+                continue
+            if format_url in format_urls:
+                continue
+            format_urls.add(format_url)
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'height': int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', format_id, 'height', default=None))
+            })
+
+        video_url = self._search_regex(
+            [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+             r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+             r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+            webpage, 'video url', group='mp4', default=None)
+        if video_url and video_url not in format_urls:
+            formats.append({
+                'url': video_url,
+            })
+
+        self._sort_formats(formats)
+
         # Only a few videos have an description
         mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
         description = mobj.group(1) if mobj else None
@@ -104,7 +144,7 @@ class XHamsterIE(InfoExtractor):
             webpage, 'upload date', fatal=False))
 
         uploader = self._html_search_regex(
-            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+href=["\'].+?xhamster\.com/user/[^>]+>(?P<uploader>.+?)</a>',
+            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)',
             webpage, 'uploader', default='anonymous')
 
         thumbnail = self._search_regex(
@@ -113,14 +153,15 @@ class XHamsterIE(InfoExtractor):
             webpage, 'thumbnail', fatal=False, group='thumbnail')
 
         duration = parse_duration(self._search_regex(
-            r'Runtime:\s*</span>\s*([\d:]+)', webpage,
+            [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
+             r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
             'duration', fatal=False))
 
         view_count = int_or_none(self._search_regex(
             r'content=["\']User(?:View|Play)s:(\d+)',
             webpage, 'view count', fatal=False))
 
-        mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
+        mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage)
         (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
 
         mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
@@ -128,32 +169,15 @@ class XHamsterIE(InfoExtractor):
 
         age_limit = self._rta_search(webpage)
 
-        hd = is_hd(webpage)
-
-        format_id = 'hd' if hd else 'sd'
-
-        video_url = extract_video_url(webpage, format_id)
-        formats = [{
-            'url': video_url,
-            'format_id': 'hd' if hd else 'sd',
-            'preference': 1,
-        }]
-
-        if not hd:
-            mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
-            webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
-            if is_hd(webpage):
-                video_url = extract_video_url(webpage, 'hd')
-                formats.append({
-                    'url': video_url,
-                    'format_id': 'hd',
-                    'preference': 2,
-                })
-
-        self._sort_formats(formats)
+        categories_html = self._search_regex(
+            r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
+            'categories', default=None)
+        categories = [clean_html(category) for category in re.findall(
+            r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'title': title,
             'description': description,
             'upload_date': upload_date,
@@ -165,6 +189,7 @@ class XHamsterIE(InfoExtractor):
             'dislike_count': int_or_none(dislike_count),
             'comment_count': int_or_none(comment_count),
             'age_limit': age_limit,
+            'categories': categories,
             'formats': formats,
         }
 
index 5584674a061fc5a67bbb65bc0b58fc96e96eae3b..bea9b87ad4123f90bcf554d7115cfd35d431afe6 100644 (file)
@@ -6,6 +6,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    js_to_json,
     orderedSet,
     parse_duration,
     sanitized_Request,
@@ -37,6 +38,22 @@ class XTubeIE(InfoExtractor):
             'comment_count': int,
             'age_limit': 18,
         }
+    }, {
+        # FLV videos with duplicated formats
+        'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+        'md5': 'a406963eb349dd43692ec54631efd88b',
+        'info_dict': {
+            'id': '9299752',
+            'display_id': 'A-Super-Run-Part-1-YT',
+            'ext': 'flv',
+            'title': 'A Super Run - Part 1 (YT)',
+            'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+            'uploader': 'tshirtguy59',
+            'duration': 579,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+        },
     }, {
         # new URL schema
         'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
             })
 
         sources = self._parse_json(self._search_regex(
-            r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
-            webpage, 'sources', group='sources'), video_id)
+            r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+            webpage, 'sources', group='sources'), video_id,
+            transform_source=js_to_json)
 
         formats = []
         for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
                 'format_id': format_id,
                 'height': int_or_none(format_id),
             })
+        self._remove_duplicate_formats(formats)
         self._sort_formats(formats)
 
         title = self._search_regex(
index e0818201a2b9ff122904fcbbf7a775a770c5dc5c..0276c0dbb26f584c0fc49cf35614f399cc203508 100644 (file)
@@ -1,14 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import base64
-
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     ExtractorError,
+    float_or_none,
+    get_element_by_attribute,
     parse_iso8601,
-    parse_duration,
+    remove_end,
 )
 
 
@@ -24,6 +23,7 @@ class XuiteIE(InfoExtractor):
             'id': '3860914',
             'ext': 'mp3',
             'title': '孤單南半球-歐德陽',
+            'description': '孤單南半球-歐德陽',
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 247.246,
             'timestamp': 1314932940,
@@ -44,7 +44,7 @@ class XuiteIE(InfoExtractor):
             'duration': 596.458,
             'timestamp': 1454242500,
             'upload_date': '20160131',
-            'uploader': 'yan12125',
+            'uploader': '屁姥',
             'uploader_id': '12158353',
             'categories': ['個人短片'],
             'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',
@@ -72,10 +72,10 @@ class XuiteIE(InfoExtractor):
         # from http://forgetfulbc.blogspot.com/2016/06/date.html
         'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
         'info_dict': {
-            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+            'id': '27447336',
             'ext': 'mp4',
             'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
-            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+            'description': 'md5:1223810fa123b179083a3aed53574706',
             'timestamp': 1466160960,
             'upload_date': '20160617',
             'uploader': 'B.C. & Lowy',
@@ -86,29 +86,9 @@ class XuiteIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def base64_decode_utf8(data):
-        return base64.b64decode(data.encode('utf-8')).decode('utf-8')
-
-    @staticmethod
-    def base64_encode_utf8(data):
-        return base64.b64encode(data.encode('utf-8')).decode('utf-8')
-
-    def _extract_flv_config(self, encoded_media_id):
-        flv_config = self._download_xml(
-            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
-            'flv config')
-        prop_dict = {}
-        for prop in flv_config.findall('./property'):
-            prop_id = self.base64_decode_utf8(prop.attrib['id'])
-            # CDATA may be empty in flv config
-            if not prop.text:
-                continue
-            encoded_content = self.base64_decode_utf8(prop.text)
-            prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)
-        return prop_dict
-
     def _real_extract(self, url):
+        # /play/ URLs provide embedded video URL and more metadata
+        url = url.replace('/embed/', '/play/')
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
@@ -121,51 +101,53 @@ class XuiteIE(InfoExtractor):
                 '%s returned error: %s' % (self.IE_NAME, error_msg),
                 expected=True)
 
-        encoded_media_id = self._search_regex(
-            r'attributes\.name\s*=\s*"([^"]+)"', webpage,
-            'encoded media id', default=None)
-        if encoded_media_id is None:
-            video_id = self._html_search_regex(
-                r'data-mediaid="(\d+)"', webpage, 'media id')
-            encoded_media_id = self.base64_encode_utf8(video_id)
-        flv_config = self._extract_flv_config(encoded_media_id)
+        media_info = self._parse_json(self._search_regex(
+            r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id)
 
-        FORMATS = {
-            'audio': 'mp3',
-            'video': 'mp4',
-        }
+        video_id = media_info['MEDIA_ID']
 
         formats = []
-        for format_tag in ('src', 'hq_src'):
-            video_url = flv_config.get(format_tag)
+        for key in ('html5Url', 'html5HQUrl'):
+            video_url = media_info.get(key)
             if not video_url:
                 continue
             format_id = self._search_regex(
-                r'\bq=(.+?)\b', video_url, 'format id', default=format_tag)
+                r'\bq=(.+?)\b', video_url, 'format id', default=None)
             formats.append({
                 'url': video_url,
-                'ext': FORMATS.get(flv_config['type'], 'mp4'),
+                'ext': 'mp4' if format_id.isnumeric() else format_id,
                 'format_id': format_id,
                 'height': int(format_id) if format_id.isnumeric() else None,
             })
         self._sort_formats(formats)
 
-        timestamp = flv_config.get('publish_datetime')
+        timestamp = media_info.get('PUBLISH_DATETIME')
         if timestamp:
             timestamp = parse_iso8601(timestamp + ' +0800', ' ')
 
-        category = flv_config.get('category')
+        category = media_info.get('catName')
         categories = [category] if category else []
 
+        uploader = media_info.get('NICKNAME')
+        uploader_url = None
+
+        author_div = get_element_by_attribute('itemprop', 'author', webpage)
+        if author_div:
+            uploader = uploader or self._html_search_meta('name', author_div)
+            uploader_url = self._html_search_regex(
+                r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div,
+                'uploader URL', fatal=False)
+
         return {
             'id': video_id,
-            'title': flv_config['title'],
-            'description': flv_config.get('description'),
-            'thumbnail': flv_config.get('thumb'),
+            'title': media_info['TITLE'],
+            'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'),
+            'thumbnail': media_info.get('ogImageUrl'),
             'timestamp': timestamp,
-            'uploader': flv_config.get('author_name'),
-            'uploader_id': flv_config.get('author_id'),
-            'duration': parse_duration(flv_config.get('duration')),
+            'uploader': uploader,
+            'uploader_id': media_info.get('MEMBER_ID'),
+            'uploader_url': uploader_url,
+            'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),
             'categories': categories,
             'formats': formats,
         }
index 30825daae956e12f450e32bc66aea737a5c293e0..eca603028d9ac91cf98e3da6c3651444f1653730 100644 (file)
@@ -6,8 +6,10 @@ from .common import InfoExtractor
 from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     clean_html,
-    ExtractorError,
     determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
 )
 
 
@@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):
             'id': '4588838',
             'ext': 'mp4',
             'title': 'Biker Takes his Girl',
+            'duration': 108,
             'age_limit': 18,
         }
     }
@@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):
             r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+        video_duration = int_or_none(self._og_search_property(
+            'duration', webpage, default=None)) or parse_duration(
+            self._search_regex(
+                r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+                webpage, 'duration', fatal=False))
 
         formats = []
 
@@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):
             'id': video_id,
             'formats': formats,
             'title': video_title,
+            'duration': video_duration,
             'thumbnail': video_thumbnail,
             'age_limit': 18,
         }
index 5c8f17eb2fa1c3bb12924ca189699181eb2b598e..e34ebe3a6612264815295dfe043f0eca1f5b2dcc 100644 (file)
@@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor):
             r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
 
         title = self._html_search_regex(
-            [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
-             r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'],
+            [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<',
+             r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'],
             webpage, 'title')
 
         thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
deleted file mode 100644 (file)
index ef55355..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
-    float_or_none,
-    month_by_abbreviation,
-    ExtractorError,
-    get_element_by_attribute,
-)
-
-
-class YamIE(InfoExtractor):
-    IE_DESC = '蕃薯藤yam天空部落'
-    _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)'
-
-    _TESTS = [{
-        # An audio hosted on Yam
-        'url': 'http://mymedia.yam.com/m/2283921',
-        'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
-        'info_dict': {
-            'id': '2283921',
-            'ext': 'mp3',
-            'title': '發現 - 趙薇 京華煙雲主題曲',
-            'description': '發現 - 趙薇 京華煙雲主題曲',
-            'uploader_id': 'princekt',
-            'upload_date': '20080807',
-            'duration': 313.0,
-        }
-    }, {
-        # An external video hosted on YouTube
-        'url': 'http://mymedia.yam.com/m/3599430',
-        'md5': '03127cf10d8f35d120a9e8e52e3b17c6',
-        'info_dict': {
-            'id': 'CNpEoQlrIgA',
-            'ext': 'mp4',
-            'upload_date': '20150306',
-            'uploader': '新莊社大瑜伽社',
-            'description': 'md5:11e2e405311633ace874f2e6226c8b17',
-            'uploader_id': '2323agoy',
-            'title': '20090412陽明山二子坪-1',
-        },
-        'skip': 'Video does not exist',
-    }, {
-        'url': 'http://mymedia.yam.com/m/3598173',
-        'info_dict': {
-            'id': '3598173',
-            'ext': 'mp4',
-        },
-        'skip': 'cause Yam system error',
-    }, {
-        'url': 'http://mymedia.yam.com/m/3599437',
-        'info_dict': {
-            'id': '3599437',
-            'ext': 'mp4',
-        },
-        'skip': 'invalid YouTube URL',
-    }, {
-        'url': 'http://mymedia.yam.com/m/2373534',
-        'md5': '7ff74b91b7a817269d83796f8c5890b1',
-        'info_dict': {
-            'id': '2373534',
-            'ext': 'mp3',
-            'title': '林俊傑&蔡卓妍-小酒窩',
-            'description': 'md5:904003395a0fcce6cfb25028ff468420',
-            'upload_date': '20080928',
-            'uploader_id': 'onliner2',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        page = self._download_webpage(url, video_id)
-
-        # Check for errors
-        system_msg = self._html_search_regex(
-            r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message',
-            default=None)
-        if system_msg:
-            raise ExtractorError(system_msg, expected=True)
-
-        # Is it hosted externally on YouTube?
-        youtube_url = self._html_search_regex(
-            r'<embed src="(http://www.youtube.com/[^"]+)"',
-            page, 'YouTube url', default=None)
-        if youtube_url:
-            return self.url_result(youtube_url, 'Youtube')
-
-        title = self._html_search_regex(
-            r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
-
-        api_page = self._download_webpage(
-            'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
-            note='Downloading API page')
-        api_result_obj = compat_urlparse.parse_qs(api_page)
-
-        info_table = get_element_by_attribute('class', 'info', page)
-        uploader_id = self._html_search_regex(
-            r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
-            info_table, 'uploader id', fatal=False)
-        mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
-                         r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
-        if mobj:
-            upload_date = '%s%02d%02d' % (
-                mobj.group('year'),
-                month_by_abbreviation(mobj.group('mon')),
-                int(mobj.group('day')))
-        else:
-            upload_date = None
-        duration = float_or_none(api_result_obj['totaltime'][0], scale=1000)
-
-        return {
-            'id': video_id,
-            'url': api_result_obj['mp3file'][0],
-            'title': title,
-            'description': self._html_search_meta('description', page),
-            'duration': duration,
-            'uploader_id': uploader_id,
-            'upload_date': upload_date,
-        }
diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py
new file mode 100644 (file)
index 0000000..e8f6ae1
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class YandexDiskIE(InfoExtractor):
+    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
+        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'info_dict': {
+            'id': 'VdOeDou8eZs6Y',
+            'ext': 'mp4',
+            'title': '4.mp4',
+            'duration': 168.6,
+            'uploader': 'y.botova',
+            'uploader_id': '300043621',
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        status = self._download_webpage(
+            'https://disk.yandex.com/auth/status', video_id, query={
+                'urlOrigin': url,
+                'source': 'public',
+                'md5': 'false',
+            })
+
+        sk = self._search_regex(
+            r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2',
+            status, 'sk', group='value')
+
+        webpage = self._download_webpage(url, video_id)
+
+        models = self._parse_json(
+            self._search_regex(
+                r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script',
+                webpage, 'video JSON'),
+            video_id)
+
+        data = next(
+            model['data'] for model in models
+            if model.get('model') == 'resource')
+
+        video_hash = data['id']
+        title = data['name']
+
+        models = self._download_json(
+            'https://disk.yandex.com/models/', video_id,
+            data=urlencode_postdata({
+                '_model.0': 'videoInfo',
+                'id.0': video_hash,
+                '_model.1': 'do-get-resource-url',
+                'id.1': video_hash,
+                'version': '13.6',
+                'sk': sk,
+            }), query={'_m': 'videoInfo'})['models']
+
+        videos = try_get(models, lambda x: x[0]['data']['videos'], list) or []
+        source_url = try_get(
+            models, lambda x: x[1]['data']['file'], compat_str)
+
+        formats = []
+        if source_url:
+            formats.append({
+                'url': source_url,
+                'format_id': 'source',
+                'ext': determine_ext(title, 'mp4'),
+                'quality': 1,
+            })
+        for video in videos:
+            format_url = video.get('url')
+            if not format_url:
+                continue
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        duration = float_or_none(try_get(
+            models, lambda x: x[0]['data']['duration']), 1000)
+        uploader = try_get(
+            data, lambda x: x['user']['display_name'], compat_str)
+        uploader_id = try_get(
+            data, lambda x: x['user']['uid'], compat_str)
+        view_count = int_or_none(try_get(
+            data, lambda x: x['meta']['views_counter']))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'formats': formats,
+        }
index fd6268ba4119d02988172a4771514cc34603db1a..eb1062142ecbc6a4702f0dc7763c414934fd3645 100644 (file)
@@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
                 'overembed': 'false',
             })['playlist']
 
-        tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+        tracks = playlist['tracks']
+        track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
 
         # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
         # missing tracks should be retrieved manually.
index b50f34e9bb30e47c679940ca1577ea8cc6683934..f33fabe194daceb9ac6ffaf838536e7c9d53cd34 100644 (file)
@@ -1,39 +1,95 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_duration,
+)
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
     _TESTS = [{
         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
-        'md5': '78fc1901148284c69af12640e01c6310',
+        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
         'info_dict': {
             'id': '2189178',
             'ext': 'mp4',
             'title': 'Zeichentrick 1',
             'age_limit': 18,
+            'duration': 2874,
         }
     }, {
         'url': 'http://www.youjizz.com/videos/-2189178.html',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youjizz.com/videos/embed/31991001',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('embed_id')
+
         webpage = self._download_webpage(url, video_id)
-        # YouJizz's HTML5 player has invalid HTML
-        webpage = webpage.replace('"controls', '" controls')
-        age_limit = self._rta_search(webpage)
-        video_title = self._html_search_regex(
-            r'<title>\s*(.*)\s*</title>', webpage, 'title')
 
-        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title')
+
+        formats = []
+
+        encodings = self._parse_json(
+            self._search_regex(
+                r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+                default='[]'),
+            video_id, fatal=False)
+        for encoding in encodings:
+            if not isinstance(encoding, dict):
+                continue
+            format_url = encoding.get('filename')
+            if not isinstance(format_url, compat_str):
+                continue
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                format_id = encoding.get('name') or encoding.get('quality')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', format_id, 'height', default=None))
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': height,
+                })
+
+        if formats:
+            info_dict = {
+                'formats': formats,
+            }
+        else:
+            # YouJizz's HTML5 player has invalid HTML
+            webpage = webpage.replace('"controls', '" controls')
+            info_dict = self._parse_html5_media_entries(
+                url, webpage, video_id)[0]
+
+        duration = parse_duration(self._search_regex(
+            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
+            default=None))
+        uploader = self._search_regex(
+            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
+            default=None)
 
         info_dict.update({
             'id': video_id,
-            'title': video_title,
-            'age_limit': age_limit,
+            'title': title,
+            'age_limit': self._rta_search(webpage),
+            'duration': duration,
+            'uploader': uploader,
         })
 
         return info_dict
index 73ebe57598a281a4debcb6ea671ebf081b63c610..0c4bc2edab616cceff7cf68f7f64d12010c6e16d 100644 (file)
@@ -1,23 +1,18 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import base64
-import itertools
 import random
 import re
 import string
 import time
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_ord,
-    compat_str,
-    compat_urllib_parse_urlencode,
-)
 from ..utils import (
     ExtractorError,
-    get_element_by_attribute,
-    try_get,
+    get_element_by_class,
+    js_to_json,
+    str_or_none,
+    strip_jsonp,
 )
 
 
@@ -26,7 +21,9 @@ class YoukuIE(InfoExtractor):
     IE_DESC = '优酷'
     _VALID_URL = r'''(?x)
         (?:
-            http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
+            https?://(
+                (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
+                video\.tudou\.com/v/)|
             youku:)
         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
     '''
@@ -35,9 +32,15 @@ class YoukuIE(InfoExtractor):
         # MD5 is unstable
         'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
         'info_dict': {
-            'id': 'XMTc1ODE5Njcy_part1',
+            'id': 'XMTc1ODE5Njcy',
             'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
-            'ext': 'flv'
+            'ext': 'mp4',
+            'duration': 74.73,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '。躲猫猫、',
+            'uploader_id': '36017967',
+            'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4',
+            'tags': list,
         }
     }, {
         'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
@@ -46,25 +49,42 @@ class YoukuIE(InfoExtractor):
         'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
         'info_dict': {
             'id': 'XODgxNjg1Mzk2',
+            'ext': 'mp4',
             'title': '武媚娘传奇 85',
+            'duration': 1999.61,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '疯狂豆花',
+            'uploader_id': '62583473',
+            'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky',
+            'tags': list,
         },
-        'playlist_count': 11,
-        'skip': 'Available in China only',
     }, {
         'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
         'info_dict': {
             'id': 'XMTI1OTczNDM5Mg',
+            'ext': 'mp4',
             'title': '花千骨 04',
+            'duration': 2363,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '放剧场-花千骨',
+            'uploader_id': '772849359',
+            'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==',
+            'tags': list,
         },
-        'playlist_count': 13,
     }, {
         'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
         'note': 'Video protected with password',
         'info_dict': {
             'id': 'XNjA1NzA2Njgw',
+            'ext': 'mp4',
             'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起',
+            'duration': 7264.5,
+            'thumbnail': r're:^https?://.*',
+            'uploader': 'FoxJin1006',
+            'uploader_id': '322014285',
+            'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==',
+            'tags': list,
         },
-        'playlist_count': 19,
         'params': {
             'videopassword': '100600',
         },
@@ -73,130 +93,38 @@ class YoukuIE(InfoExtractor):
         'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',
         'info_dict': {
             'id': 'XOTUxMzg4NDMy',
+            'ext': 'mp4',
             'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft',
+            'duration': 702.08,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '明月庄主moon',
+            'uploader_id': '38465621',
+            'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0',
+            'tags': list,
+        },
+    }, {
+        'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805',
+        'info_dict': {
+            'id': 'XMjIyNzAzMTQ4NA',
+            'ext': 'mp4',
+            'title': '卡马乔国足开大脚长传冲吊集锦',
+            'duration': 289,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '阿卜杜拉之星',
+            'uploader_id': '2382249',
+            'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==',
+            'tags': list,
         },
-        'playlist_count': 6,
+    }, {
+        'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html',
+        'only_matching': True,
     }]
 
-    def construct_video_urls(self, data):
-        # get sid, token
-        def yk_t(s1, s2):
-            ls = list(range(256))
-            t = 0
-            for i in range(256):
-                t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
-                ls[i], ls[t] = ls[t], ls[i]
-            s = bytearray()
-            x, y = 0, 0
-            for i in range(len(s2)):
-                y = (y + 1) % 256
-                x = (x + ls[y]) % 256
-                ls[x], ls[y] = ls[y], ls[x]
-                s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
-            return bytes(s)
-
-        sid, token = yk_t(
-            b'becaf9be', base64.b64decode(data['security']['encrypt_string'].encode('ascii'))
-        ).decode('ascii').split('_')
-
-        # get oip
-        oip = data['security']['ip']
-
-        fileid_dict = {}
-        for stream in data['stream']:
-            if stream.get('channel_type') == 'tail':
-                continue
-            format = stream.get('stream_type')
-            fileid = try_get(
-                stream, lambda x: x['segs'][0]['fileid'],
-                compat_str) or stream['stream_fileid']
-            fileid_dict[format] = fileid
-
-        def get_fileid(format, n):
-            number = hex(int(str(n), 10))[2:].upper()
-            if len(number) == 1:
-                number = '0' + number
-            streamfileids = fileid_dict[format]
-            fileid = streamfileids[0:8] + number + streamfileids[10:]
-            return fileid
-
-        # get ep
-        def generate_ep(format, n):
-            fileid = get_fileid(format, n)
-            ep_t = yk_t(
-                b'bf7e5f01',
-                ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
-            )
-            ep = base64.b64encode(ep_t).decode('ascii')
-            return ep
-
-        # generate video_urls
-        video_urls_dict = {}
-        for stream in data['stream']:
-            if stream.get('channel_type') == 'tail':
-                continue
-            format = stream.get('stream_type')
-            video_urls = []
-            for dt in stream['segs']:
-                n = str(stream['segs'].index(dt))
-                param = {
-                    'K': dt['key'],
-                    'hd': self.get_hd(format),
-                    'myp': 0,
-                    'ypp': 0,
-                    'ctype': 12,
-                    'ev': 1,
-                    'token': token,
-                    'oip': oip,
-                    'ep': generate_ep(format, n)
-                }
-                video_url = \
-                    'http://k.youku.com/player/getFlvPath/' + \
-                    'sid/' + sid + \
-                    '_00' + \
-                    '/st/' + self.parse_ext_l(format) + \
-                    '/fileid/' + get_fileid(format, n) + '?' + \
-                    compat_urllib_parse_urlencode(param)
-                video_urls.append(video_url)
-            video_urls_dict[format] = video_urls
-
-        return video_urls_dict
-
     @staticmethod
     def get_ysuid():
         return '%d%s' % (int(time.time()), ''.join([
             random.choice(string.ascii_letters) for i in range(3)]))
 
-    def get_hd(self, fm):
-        hd_id_dict = {
-            '3gp': '0',
-            '3gphd': '1',
-            'flv': '0',
-            'flvhd': '0',
-            'mp4': '1',
-            'mp4hd': '1',
-            'mp4hd2': '1',
-            'mp4hd3': '1',
-            'hd2': '2',
-            'hd3': '3',
-        }
-        return hd_id_dict[fm]
-
-    def parse_ext_l(self, fm):
-        ext_dict = {
-            '3gp': 'flv',
-            '3gphd': 'mp4',
-            'flv': 'flv',
-            'flvhd': 'flv',
-            'mp4': 'mp4',
-            'mp4hd': 'mp4',
-            'mp4hd2': 'flv',
-            'mp4hd3': 'flv',
-            'hd2': 'flv',
-            'hd3': 'flv',
-        }
-        return ext_dict[fm]
-
     def get_format_name(self, fm):
         _dict = {
             '3gp': 'h6',
@@ -210,32 +138,40 @@ class YoukuIE(InfoExtractor):
             'hd2': 'h2',
             'hd3': 'h1',
         }
-        return _dict[fm]
+        return _dict.get(fm)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         self._set_cookie('youku.com', '__ysuid', self.get_ysuid())
+        self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
 
-        def retrieve_data(req_url, note):
-            headers = {
-                'Referer': req_url,
-            }
-            headers.update(self.geo_verification_headers())
-            self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
+        _, urlh = self._download_webpage_handle(
+            'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info')
+        # The etag header is '"foobar"'; let's remove the double quotes
+        cna = urlh.headers['etag'][1:-1]
 
-            raw_data = self._download_json(req_url, video_id, note=note, headers=headers)
-
-            return raw_data['data']
+        # request basic data
+        basic_data_params = {
+            'vid': video_id,
+            'ccode': '0402' if 'tudou.com' in url else '0401',
+            'client_ip': '192.168.1.1',
+            'utid': cna,
+            'client_ts': time.time() / 1000,
+        }
 
         video_password = self._downloader.params.get('videopassword')
-
-        # request basic data
-        basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id
         if video_password:
-            basic_data_url += '&pwd=%s' % video_password
+            basic_data_params['password'] = video_password
 
-        data = retrieve_data(basic_data_url, 'Downloading JSON metadata')
+        headers = {
+            'Referer': url,
+        }
+        headers.update(self.geo_verification_headers())
+        data = self._download_json(
+            'https://ups.youku.com/ups/get.json', video_id,
+            'Downloading JSON metadata',
+            query=basic_data_params, headers=headers)['data']
 
         error = data.get('error')
         if error:
@@ -253,86 +189,111 @@ class YoukuIE(InfoExtractor):
                 raise ExtractorError(msg)
 
         # get video title
-        title = data['video']['title']
-
-        # generate video_urls_dict
-        video_urls_dict = self.construct_video_urls(data)
-
-        # construct info
-        entries = [{
-            'id': '%s_part%d' % (video_id, i + 1),
-            'title': title,
-            'formats': [],
-            # some formats are not available for all parts, we have to detect
-            # which one has all
-        } for i in range(max(len(v.get('segs')) for v in data['stream']))]
-        for stream in data['stream']:
-            if stream.get('channel_type') == 'tail':
-                continue
-            fm = stream.get('stream_type')
-            video_urls = video_urls_dict[fm]
-            for video_url, seg, entry in zip(video_urls, stream['segs'], entries):
-                entry['formats'].append({
-                    'url': video_url,
-                    'format_id': self.get_format_name(fm),
-                    'ext': self.parse_ext_l(fm),
-                    'filesize': int(seg['size']),
-                    'width': stream.get('width'),
-                    'height': stream.get('height'),
-                })
+        video_data = data['video']
+        title = video_data['title']
+
+        formats = [{
+            'url': stream['m3u8_url'],
+            'format_id': self.get_format_name(stream.get('stream_type')),
+            'ext': 'mp4',
+            'protocol': 'm3u8_native',
+            'filesize': int(stream.get('size')),
+            'width': stream.get('width'),
+            'height': stream.get('height'),
+        } for stream in data['stream'] if stream.get('channel_type') != 'tail']
+        self._sort_formats(formats)
 
         return {
-            '_type': 'multi_video',
             'id': video_id,
             'title': title,
-            'entries': entries,
+            'formats': formats,
+            'duration': video_data.get('seconds'),
+            'thumbnail': video_data.get('logo'),
+            'uploader': video_data.get('username'),
+            'uploader_id': str_or_none(video_data.get('userid')),
+            'uploader_url': data.get('uploader', {}).get('homepage'),
+            'tags': video_data.get('tags'),
         }
 
 
 class YoukuShowIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
+    _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
     IE_NAME = 'youku:show'
 
-    _TEST = {
-        'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
+    _TESTS = [{
+        'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
         'info_dict': {
             'id': 'zc7c670be07ff11e48b3f',
-            'title': '花千骨 未删减版',
-            'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
+            'title': '花千骨 DVD版',
+            'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
         },
         'playlist_count': 50,
-    }
-
-    _PAGE_SIZE = 40
+    }, {
+        # Episode number not starting from 1
+        'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+        'info_dict': {
+            'id': 'zefbfbd70efbfbd780bef',
+            'title': '超级飞侠3',
+            'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+        },
+        'playlist_count': 24,
+    }, {
+        # Ongoing playlist. The initial page is the last one
+        'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+        'only_matchine': True,
+    }]
 
-    def _find_videos_in_page(self, webpage):
-        videos = re.findall(
-            r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
-        return [
-            self.url_result(video_url, YoukuIE.ie_key(), title)
-            for video_url, title in videos]
+    def _extract_entries(self, playlist_data_url, show_id, note, query):
+        query['callback'] = 'cb'
+        playlist_data = self._download_json(
+            playlist_data_url, show_id, query=query, note=note,
+            transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+        drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+                      get_element_by_class('p-drama-half-row', playlist_data))
+        if drama_list is None:
+            raise ExtractorError('No episodes found')
+        video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+        return playlist_data, [
+            self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+            for video_url in video_urls]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
         webpage = self._download_webpage(url, show_id)
 
-        entries = self._find_videos_in_page(webpage)
-
-        playlist_title = self._html_search_regex(
-            r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
-        detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
-        playlist_description = self._html_search_regex(
-            r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
-            detail_div, 'playlist description', fatal=False)
-
-        for idx in itertools.count(1):
-            episodes_page = self._download_webpage(
-                'http://www.youku.com/show_episode/id_%s.html' % show_id,
-                show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
-                note='Downloading episodes page %d' % idx)
-            new_entries = self._find_videos_in_page(episodes_page)
+        entries = []
+        page_config = self._parse_json(self._search_regex(
+            r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
+            show_id, transform_source=js_to_json)
+        first_page, initial_entries = self._extract_entries(
+            'http://list.youku.com/show/module', show_id,
+            note='Downloading initial playlist data page',
+            query={
+                'id': page_config['showid'],
+                'tab': 'showInfo',
+            })
+        first_page_reload_id = self._html_search_regex(
+            r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+        # The first reload_id has the same items as first_page
+        reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+        for idx, reload_id in enumerate(reload_ids):
+            if reload_id == first_page_reload_id:
+                entries.extend(initial_entries)
+                continue
+            _, new_entries = self._extract_entries(
+                'http://list.youku.com/show/episode', show_id,
+                note='Downloading playlist data page %d' % (idx + 1),
+                query={
+                    'id': page_config['showid'],
+                    'stage': reload_id,
+                })
             entries.extend(new_entries)
-            if len(new_entries) < self._PAGE_SIZE:
-                break
 
-        return self.playlist_result(entries, show_id, playlist_title, playlist_description)
+        desc = self._html_search_meta('description', webpage, fatal=False)
+        playlist_title = desc.split(',')[0] if desc else None
+        detail_li = get_element_by_class('p-intro', webpage)
+        playlist_description = get_element_by_class(
+            'intro-more', detail_li) if detail_li else None
+
+        return self.playlist_result(
+            entries, show_id, playlist_title, playlist_description)
index 34ab878a4167580b50ef5b3d5bee94e58b02fe41..547adefeba00ffb58a7e4dbd4205fc935c7ff2be 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     int_or_none,
     sanitized_Request,
@@ -26,7 +27,7 @@ class YouPornIE(InfoExtractor):
             'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Ask Dan And Jennifer',
-            'upload_date': '20101221',
+            'upload_date': '20101217',
             'average_rating': int,
             'view_count': int,
             'comment_count': int,
@@ -45,7 +46,7 @@ class YouPornIE(InfoExtractor):
             'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'Unknown',
-            'upload_date': '20111125',
+            'upload_date': '20110418',
             'average_rating': int,
             'view_count': int,
             'comment_count': int,
@@ -68,28 +69,46 @@ class YouPornIE(InfoExtractor):
         webpage = self._download_webpage(request, display_id)
 
         title = self._search_regex(
-            [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1',
-             r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'],
-            webpage, 'title', group='title')
+            [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
+            webpage, 'title', group='title',
+            default=None) or self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, fatal=True)
 
         links = []
 
+        # Main source
+        definitions = self._parse_json(
+            self._search_regex(
+                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
+                'media definitions', default='[]'),
+            video_id, fatal=False)
+        if definitions:
+            for definition in definitions:
+                if not isinstance(definition, dict):
+                    continue
+                video_url = definition.get('videoUrl')
+                if isinstance(video_url, compat_str) and video_url:
+                    links.append(video_url)
+
+        # Fallback #1, this also contains extra low quality 180p format
+        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
+            links.append(link)
+
+        # Fallback #2 (unavailable as at 22.06.2017)
         sources = self._search_regex(
             r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
         if sources:
             for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
                 links.append(link)
 
-        # Fallback #1
+        # Fallback #3 (unavailable as at 22.06.2017)
         for _, link in re.findall(
-                r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
-            links.append(link)
-
-        # Fallback #2, this also contains extra low quality 180p format
-        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
+                r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
             links.append(link)
 
-        # Fallback #3, encrypted links
+        # Fallback #4, encrypted links (unavailable as at 22.06.2017)
         for _, encrypted_link in re.findall(
                 r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
             links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
@@ -124,7 +143,8 @@ class YouPornIE(InfoExtractor):
             r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
             webpage, 'uploader', fatal=False)
         upload_date = unified_strdate(self._html_search_regex(
-            r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>',
+            [r'Date\s+[Aa]dded:\s*<span>([^<]+)',
+             r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
             webpage, 'upload date', fatal=False))
 
         age_limit = self._rta_search(webpage)
index 480f403dadf91a355a6e91fac8c46945a04d1dc1..ad2e933ee4e34c9ebdb982ca66278e6e4c4a06b0 100644 (file)
@@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter
 from ..swfinterp import SWFInterpreter
 from ..compat import (
     compat_chr,
+    compat_kwargs,
     compat_parse_qs,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
@@ -38,7 +39,6 @@ from ..utils import (
     parse_duration,
     remove_quotes,
     remove_start,
-    sanitized_Request,
     smuggle_url,
     str_to_int,
     try_get,
@@ -54,7 +54,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
     _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
-    _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
+
+    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -96,74 +100,157 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
         login_form = self._hidden_inputs(login_page)
 
-        login_form.update({
-            'checkConnection': 'youtube',
-            'Email': username,
-            'Passwd': password,
-        })
+        def req(url, f_req, note, errnote):
+            data = login_form.copy()
+            data.update({
+                'pstMsg': 1,
+                'checkConnection': 'youtube',
+                'checkedDomains': 'youtube',
+                'hl': 'en',
+                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+                'f.req': json.dumps(f_req),
+                'flowName': 'GlifWebSignIn',
+                'flowEntry': 'ServiceLogin',
+            })
+            return self._download_json(
+                url, None, note=note, errnote=errnote,
+                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+                fatal=False,
+                data=urlencode_postdata(data), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+                    'Google-Accounts-XSRF': 1,
+                })
 
-        login_results = self._download_webpage(
-            self._PASSWORD_CHALLENGE_URL, None,
-            note='Logging in', errnote='unable to log in', fatal=False,
-            data=urlencode_postdata(login_form))
-        if login_results is False:
-            return False
+        def warn(message):
+            self._downloader.report_warning(message)
+
+        lookup_req = [
+            username,
+            None, [], None, 'US', None, None, 2, False, True,
+            [
+                None, None,
+                [2, 1, None, 1,
+                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+                 None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ],
+            username,
+        ]
 
-        error_msg = self._html_search_regex(
-            r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
-            login_results, 'error message', default=None)
-        if error_msg:
-            raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
+        lookup_results = req(
+            self._LOOKUP_URL, lookup_req,
+            'Looking up account info', 'Unable to look up account info')
 
-        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
-            raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+        if lookup_results is False:
+            return False
 
-        # Two-Factor
-        # TODO add SMS and phone call support - these require making a request and then prompting the user
+        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+        if not user_hash:
+            warn('Unable to extract user hash')
+            return False
 
-        if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
-            tfa_code = self._get_tfa_info('2-step verification code')
+        challenge_req = [
+            user_hash,
+            None, 1, None, [1, None, None, None, [password, None, True]],
+            [
+                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ]]
 
-            if not tfa_code:
-                self._downloader.report_warning(
-                    'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
-                    '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
-                return False
+        challenge_results = req(
+            self._CHALLENGE_URL, challenge_req,
+            'Logging in', 'Unable to log in')
 
-            tfa_code = remove_start(tfa_code, 'G-')
+        if challenge_results is False:
+            return
 
-            tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+        login_res = try_get(challenge_results, lambda x: x[0][5], list)
+        if login_res:
+            login_msg = try_get(login_res, lambda x: x[5], compat_str)
+            warn(
+                'Unable to login: %s' % 'Invalid password'
+                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
+            return False
 
-            tfa_form_strs.update({
-                'Pin': tfa_code,
-                'TrustDevice': 'on',
-            })
+        res = try_get(challenge_results, lambda x: x[0][-1], list)
+        if not res:
+            warn('Unable to extract result entry')
+            return False
 
-            tfa_data = urlencode_postdata(tfa_form_strs)
+        tfa = try_get(res, lambda x: x[0][0], list)
+        if tfa:
+            tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+            if tfa_str == 'TWO_STEP_VERIFICATION':
+                # SEND_SUCCESS - TFA code has been successfully sent to phone
+                # QUOTA_EXCEEDED - reached the limit of TFA codes
+                status = try_get(tfa, lambda x: x[5], compat_str)
+                if status == 'QUOTA_EXCEEDED':
+                    warn('Exceeded the limit of TFA codes, try later')
+                    return False
+
+                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+                if not tl:
+                    warn('Unable to extract TL')
+                    return False
+
+                tfa_code = self._get_tfa_info('2-step verification code')
+
+                if not tfa_code:
+                    warn(
+                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+                    return False
+
+                tfa_code = remove_start(tfa_code, 'G-')
+
+                tfa_req = [
+                    user_hash, None, 2, None,
+                    [
+                        9, None, None, None, None, None, None, None,
+                        [None, tfa_code, True, 2]
+                    ]]
+
+                tfa_results = req(
+                    self._TFA_URL.format(tl), tfa_req,
+                    'Submitting TFA code', 'Unable to submit TFA code')
+
+                if tfa_results is False:
+                    return False
+
+                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+                if tfa_res:
+                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+                    warn(
+                        'Unable to finish TFA: %s' % 'Invalid TFA code'
+                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
+                    return False
+
+                check_cookie_url = try_get(
+                    tfa_results, lambda x: x[0][-1][2], compat_str)
+        else:
+            check_cookie_url = try_get(res, lambda x: x[2], compat_str)
 
-            tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
-            tfa_results = self._download_webpage(
-                tfa_req, None,
-                note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
+        if not check_cookie_url:
+            warn('Unable to extract CheckCookie URL')
+            return False
 
-            if tfa_results is False:
-                return False
+        check_cookie_results = self._download_webpage(
+            check_cookie_url, None, 'Checking cookie', fatal=False)
 
-            if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
-                self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
-                return False
-            if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
-                self._downloader.report_warning('unable to log in - did the page structure change?')
-                return False
-            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
-                self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
-                return False
+        if check_cookie_results is False:
+            return False
 
-        if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
-            self._downloader.report_warning('unable to log in: bad username or password')
+        if 'https://myaccount.google.com/' not in check_cookie_results:
+            warn('Unable to log in')
             return False
+
         return True
 
+    def _download_webpage(self, *args, **kwargs):
+        kwargs.setdefault('query', {})['disable_polymer'] = 'true'
+        return super(YoutubeBaseInfoExtractor, self)._download_webpage(
+            *args, **compat_kwargs(kwargs))
+
     def _real_initialize(self):
         if self._downloader is None:
             return
@@ -592,6 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
         },
         # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+        # YouTube Red ad is not captured for creator
         {
             'url': '__2ABJjxzNo',
             'info_dict': {
@@ -921,6 +1009,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'Skipping DASH manifest',
             ],
         },
+        {
+            # The following content has been identified by the YouTube community
+            # as inappropriate or offensive to some audiences.
+            'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
+            'info_dict': {
+                'id': '6SJNVb0GnPI',
+                'ext': 'mp4',
+                'title': 'Race Differences in Intelligence',
+                'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
+                'duration': 965,
+                'upload_date': '20140124',
+                'uploader': 'New Century Foundation',
+                'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
+                'license': 'Standard YouTube License',
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             # itag 212
             'url': '1t24XAntNCY',
@@ -1188,37 +1297,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     sub_lang_list[sub_lang] = sub_formats
                 return sub_lang_list
 
+            def make_captions(sub_url, sub_langs):
+                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
+                caption_qs = compat_parse_qs(parsed_sub_url.query)
+                captions = {}
+                for sub_lang in sub_langs:
+                    sub_formats = []
+                    for ext in self._SUBTITLE_FORMATS:
+                        caption_qs.update({
+                            'tlang': [sub_lang],
+                            'fmt': [ext],
+                        })
+                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
+                            query=compat_urllib_parse_urlencode(caption_qs, True)))
+                        sub_formats.append({
+                            'url': sub_url,
+                            'ext': ext,
+                        })
+                    captions[sub_lang] = sub_formats
+                return captions
+
+            # New captions format as of 22.06.2017
+            player_response = args.get('player_response')
+            if player_response and isinstance(player_response, compat_str):
+                player_response = self._parse_json(
+                    player_response, video_id, fatal=False)
+                if player_response:
+                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                    base_url = renderer['captionTracks'][0]['baseUrl']
+                    sub_lang_list = []
+                    for lang in renderer['translationLanguages']:
+                        lang_code = lang.get('languageCode')
+                        if lang_code:
+                            sub_lang_list.append(lang_code)
+                    return make_captions(base_url, sub_lang_list)
+
             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
+            # Does not used anymore as of 22.06.2017
             caption_tracks = args['caption_tracks']
             caption_translation_languages = args['caption_translation_languages']
             caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
-            parsed_caption_url = compat_urllib_parse_urlparse(caption_url)
-            caption_qs = compat_parse_qs(parsed_caption_url.query)
-
-            sub_lang_list = {}
+            sub_lang_list = []
             for lang in caption_translation_languages.split(','):
                 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                 sub_lang = lang_qs.get('lc', [None])[0]
-                if not sub_lang:
-                    continue
-                sub_formats = []
-                for ext in self._SUBTITLE_FORMATS:
-                    caption_qs.update({
-                        'tlang': [sub_lang],
-                        'fmt': [ext],
-                    })
-                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
-                        query=compat_urllib_parse_urlencode(caption_qs, True)))
-                    sub_formats.append({
-                        'url': sub_url,
-                        'ext': ext,
-                    })
-                sub_lang_list[sub_lang] = sub_formats
-            return sub_lang_list
+                if sub_lang:
+                    sub_lang_list.append(sub_lang)
+            return make_captions(caption_url, sub_lang_list)
         # An extractor error can be raise by the download process if there are
         # no automatic captions but there are subtitles
-        except (KeyError, ExtractorError):
+        except (KeyError, IndexError, ExtractorError):
             self._downloader.report_warning(err_msg)
             return {}
 
@@ -1245,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             playback_url, video_id, 'Marking watched',
             'Unable to mark watched', fatal=False)
 
+    @staticmethod
+    def _extract_urls(webpage):
+        # Embedded YouTube player
+        entries = [
+            unescapeHTML(mobj.group('url'))
+            for mobj in re.finditer(r'''(?x)
+            (?:
+                <iframe[^>]+?src=|
+                data-video-url=|
+                <embed[^>]+?src=|
+                embedSWF\(?:\s*|
+                <object[^>]+data=|
+                new\s+SWFObject\(
+            )
+            (["\'])
+                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+                (?:embed|v|p)/.+?)
+            \1''', webpage)]
+
+        # lazyYT YouTube embed
+        entries.extend(list(map(
+            unescapeHTML,
+            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
+
+        # Wordpress "YouTube Video Importer" plugin
+        matches = re.findall(r'''(?x)<div[^>]+
+            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
+        entries.extend(m[-1] for m in matches)
+
+        return entries
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = YoutubeIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
     @classmethod
     def extract_id(cls, url):
         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
@@ -1257,6 +1423,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 
+    @staticmethod
+    def _extract_chapters(description, duration):
+        if not description:
+            return None
+        chapter_lines = re.findall(
+            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+            description)
+        if not chapter_lines:
+            return None
+        chapters = []
+        for next_num, (chapter_line, time_point) in enumerate(
+                chapter_lines, start=1):
+            start_time = parse_duration(time_point)
+            if start_time is None:
+                continue
+            if start_time > duration:
+                break
+            end_time = (duration if next_num == len(chapter_lines)
+                        else parse_duration(chapter_lines[next_num][1]))
+            if end_time is None:
+                continue
+            if end_time > duration:
+                end_time = duration
+            if start_time > end_time:
+                break
+            chapter_title = re.sub(
+                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+            chapter_title = re.sub(r'\s+', ' ', chapter_title)
+            chapters.append({
+                'start_time': start_time,
+                'end_time': end_time,
+                'title': chapter_title,
+            })
+        return chapters
+
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
 
@@ -1300,9 +1501,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             if dash_mpd and dash_mpd[0] not in dash_mpds:
                 dash_mpds.append(dash_mpd[0])
 
+        is_live = None
+        view_count = None
+
+        def extract_view_count(v_info):
+            return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
+
         # Get video info
         embed_webpage = None
-        is_live = None
         if re.search(r'player-age-gate-content">', video_webpage) is not None:
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -1325,6 +1531,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         else:
             age_gate = False
             video_info = None
+            sts = None
             # Try looking directly into the video webpage
             ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
             if ytplayer_config:
@@ -1341,6 +1548,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
                 if args.get('livestream') == '1' or args.get('live_playback') == 1:
                     is_live = True
+                sts = ytplayer_config.get('sts')
             if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                 # We also try looking in get_video_info since it may contain different dashmpd
                 # URL that points to a DASH manifest with possibly different itag set (some itags
@@ -1349,17 +1557,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 # The general idea is to take a union of itags of both DASH manifests (for example
                 # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
                 self.report_video_info_webpage_download(video_id)
-                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
-                    video_info_url = (
-                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                        % (proto, video_id, el_type))
+                for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+                    query = {
+                        'video_id': video_id,
+                        'ps': 'default',
+                        'eurl': '',
+                        'gl': 'US',
+                        'hl': 'en',
+                    }
+                    if el:
+                        query['el'] = el
+                    if sts:
+                        query['sts'] = sts
                     video_info_webpage = self._download_webpage(
-                        video_info_url,
+                        '%s://www.youtube.com/get_video_info' % proto,
                         video_id, note=False,
-                        errnote='unable to download video info webpage')
+                        errnote='unable to download video info webpage',
+                        fatal=False, query=query)
+                    if not video_info_webpage:
+                        continue
                     get_video_info = compat_parse_qs(video_info_webpage)
-                    if get_video_info.get('use_cipher_signature') != ['True']:
-                        add_dash_mpd(get_video_info)
+                    add_dash_mpd(get_video_info)
+                    if view_count is None:
+                        view_count = extract_view_count(get_video_info)
                     if not video_info:
                         video_info = get_video_info
                     if 'token' in get_video_info:
@@ -1399,9 +1619,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             video_title = '_'
 
         # description
-        video_description = get_element_by_id("eow-description", video_webpage)
+        description_original = video_description = get_element_by_id("eow-description", video_webpage)
         if video_description:
-            video_description = re.sub(r'''(?x)
+            description_original = video_description = re.sub(r'''(?x)
                 <a\s+
                     (?:[a-zA-Z-]+="[^"]*"\s+)*?
                     (?:title|href)="([^"]+)"\s+
@@ -1443,10 +1663,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 return self.playlist_result(entries, video_id, video_title, video_description)
             self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
 
-        if 'view_count' in video_info:
-            view_count = int(video_info['view_count'][0])
-        else:
-            view_count = None
+        if view_count is None:
+            view_count = extract_view_count(video_info)
 
         # Check for "rental" videos
         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
@@ -1490,10 +1708,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if not upload_date:
             upload_date = self._search_regex(
                 [r'(?s)id="eow-date.*?>(.*?)</span>',
-                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
                 video_webpage, 'upload date', default=None)
-            if upload_date:
-                upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
         upload_date = unified_strdate(upload_date)
 
         video_license = self._html_search_regex(
@@ -1501,7 +1717,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             video_webpage, 'license', default=None)
 
         m_music = re.search(
-            r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
+            r'''(?x)
+                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+                <ul[^>]*>\s*
+                <li>(?P<title>.+?)
+                by (?P<creator>.+?)
+                (?:
+                    \(.+?\)|
+                    <a[^>]*
+                        (?:
+                            \bhref=["\']/red[^>]*>|             # drop possible
+                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
+                        )
+                    .*?
+                )?</li
+            ''',
             video_webpage)
         if m_music:
             video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
@@ -1558,6 +1788,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if self._downloader.params.get('writeannotations', False):
             video_annotations = self._extract_annotations(video_id)
 
+        chapters = self._extract_chapters(description_original, video_duration)
+
         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
             self.report_rtmp_download()
             formats = [{
@@ -1591,12 +1823,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 format_id = url_data['itag'][0]
                 url = url_data['url'][0]
 
-                if 'sig' in url_data:
-                    url += '&signature=' + url_data['sig'][0]
-                elif 's' in url_data:
-                    encrypted_sig = url_data['s'][0]
+                if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
                     ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
-
                     jsplayer_url_json = self._search_regex(
                         ASSETS_RE,
                         embed_webpage if age_gate else video_webpage,
@@ -1617,6 +1845,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             video_webpage, 'age gate player URL')
                         player_url = json.loads(player_url_json)
 
+                if 'sig' in url_data:
+                    url += '&signature=' + url_data['sig'][0]
+                elif 's' in url_data:
+                    encrypted_sig = url_data['s'][0]
+
                     if self._downloader.params.get('verbose'):
                         if player_url is None:
                             player_version = 'unknown'
@@ -1790,6 +2023,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'duration': video_duration,
             'age_limit': 18 if age_gate else 0,
             'annotations': video_annotations,
+            'chapters': chapters,
             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
             'view_count': view_count,
             'like_count': like_count,
@@ -1861,7 +2095,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
                      |
                         (%(playlist_id)s)
                      )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
-    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
     _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644 (file)
index 0000000..889aff5
--- /dev/null
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://zaq1.pl/video/xev0e',
+        'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+        'info_dict': {
+            'id': 'xev0e',
+            'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+            'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+            'ext': 'mp4',
+            'duration': 511,
+            'timestamp': 1490896361,
+            'uploader': 'Anonim',
+            'upload_date': '20170330',
+            'view_count': int,
+        }
+    }, {
+        # malformed JSON-LD
+        'url': 'http://zaq1.pl/video/x81vn',
+        'info_dict': {
+            'id': 'x81vn',
+            'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+            'ext': 'mp4',
+            'duration': 6234,
+            'timestamp': 1493494860,
+            'uploader': 'Anonim',
+            'upload_date': '20170429',
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to parse JSON'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'video url', group='url')
+
+        info = self._search_json_ld(webpage, video_id, fatal=False)
+
+        def extract_data(field, name, fatal=False):
+            return self._search_regex(
+                r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+                webpage, field, fatal=fatal, group='field')
+
+        if not info.get('title'):
+            info['title'] = extract_data('file-name', 'title', fatal=True)
+
+        if not info.get('duration'):
+            info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+        if not info.get('thumbnail'):
+            info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+        if not info.get('timestamp'):
+            info['timestamp'] = unified_timestamp(self._html_search_meta(
+                'uploadDate', webpage, 'timestamp'))
+
+        if not info.get('interactionCount'):
+            info['view_count'] = int_or_none(self._html_search_meta(
+                'interactionCount', webpage, 'view count'))
+
+        uploader = self._html_search_regex(
+            r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+            fatal=False)
+
+        width = int_or_none(self._html_search_meta(
+            'width', webpage, fatal=False))
+        height = int_or_none(self._html_search_meta(
+            'height', webpage, fatal=False))
+
+        info.update({
+            'id': video_id,
+            'formats': [{
+                'url': video_url,
+                'width': width,
+                'height': height,
+                'http_headers': {
+                    'Referer': url,
+                },
+            }],
+            'uploader': uploader,
+        })
+
+        return info
index 24cdec28c6cb2332232212d6bcf39d03edc27c7a..7bda596102a40775b06fe4318c3915633d586a67 100644 (file)
@@ -6,6 +6,7 @@ import re
 
 from .utils import (
     ExtractorError,
+    remove_quotes,
 )
 
 _OPERATORS = [
@@ -57,7 +58,6 @@ class JSInterpreter(object):
 
     def interpret_expression(self, expr, local_vars, allow_recursion):
         expr = expr.strip()
-
         if expr == '':  # Empty expression
             return None
 
@@ -121,11 +121,19 @@ class JSInterpreter(object):
             pass
 
         m = re.match(
-            r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(
+            r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
             expr)
         if m:
             variable = m.group('var')
-            member = m.group('member')
+            member = remove_quotes(m.group('member') or m.group('member2'))
             arg_str = m.group('args')
 
             if variable in local_vars:
@@ -173,14 +181,6 @@ class JSInterpreter(object):
 
             return obj[member](argvals)
 
-        m = re.match(
-            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
-        if m:
-            val = local_vars[m.group('in')]
-            idx = self.interpret_expression(
-                m.group('idx'), local_vars, allow_recursion - 1)
-            return val[idx]
-
         for op, opfunc in _OPERATORS:
             m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
             if not m:
@@ -211,21 +211,25 @@ class JSInterpreter(object):
         raise ExtractorError('Unsupported JS expression %r' % expr)
 
     def extract_object(self, objname):
+        _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
         obj = {}
         obj_m = re.search(
-            (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) +
-            r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' +
-            r'\}\s*;',
+            r'''(?x)
+                (?<!this\.)%s\s*=\s*{\s*
+                    (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
+                }\s*;
+            ''' % (re.escape(objname), _FUNC_NAME_RE),
             self.code)
         fields = obj_m.group('fields')
         # Currently, it only supports function definitions
         fields_m = re.finditer(
-            r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
-            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+            r'''(?x)
+                (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
+            ''' % _FUNC_NAME_RE,
             fields)
         for f in fields_m:
             argnames = f.group('args').split(',')
-            obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+            obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
 
         return obj
 
index 52309fb8421a0e1017646572f369b56443371a81..38439c97165e18e37d9bf4cd3ebf906f099c9b30 100644 (file)
@@ -20,6 +20,24 @@ from .utils import (
 from .version import __version__
 
 
+def _hide_login_info(opts):
+    PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
+    eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+    def _scrub_eq(o):
+        m = eqre.match(o)
+        if m:
+            return m.group('key') + '=PRIVATE'
+        else:
+            return o
+
+    opts = list(map(_scrub_eq, opts))
+    for idx, opt in enumerate(opts):
+        if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+            opts[idx + 1] = 'PRIVATE'
+    return opts
+
+
 def parseOpts(overrideArguments=None):
     def _readOptions(filename_bytes, default=[]):
         try:
@@ -93,26 +111,6 @@ def parseOpts(overrideArguments=None):
     def _comma_separated_values_options_callback(option, opt_str, value, parser):
         setattr(parser.values, option.dest, value.split(','))
 
-    def _hide_login_info(opts):
-        PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']
-        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
-
-        def _scrub_eq(o):
-            m = eqre.match(o)
-            if m:
-                return m.group('key') + '=PRIVATE'
-            else:
-                return o
-
-        opts = list(map(_scrub_eq, opts))
-        for private_opt in PRIVATE_OPTS:
-            try:
-                i = opts.index(private_opt)
-                opts[i + 1] = 'PRIVATE'
-            except ValueError:
-                pass
-        return opts
-
     # No need to wrap help messages if we're on a wide console
     columns = compat_get_terminal_size().columns
     max_width = columns if columns else 80
@@ -310,7 +308,7 @@ def parseOpts(overrideArguments=None):
         metavar='FILTER', dest='match_filter', default=None,
         help=(
             'Generic video filter. '
-            'Specify any key (see help for -o for a list of available keys) to '
+            'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to '
             'match if the key is present, '
             '!key to check if the key is not present, '
             'key > NUMBER (like "comment_count > 12", also works with '
@@ -618,7 +616,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '-j', '--dump-json',
         action='store_true', dest='dumpjson', default=False,
-        help='Simulate, quiet but print JSON information. See --output for a description of available keys.')
+        help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.')
     verbosity.add_option(
         '-J', '--dump-single-json',
         action='store_true', dest='dump_single_json', default=False,
@@ -814,11 +812,12 @@ def parseOpts(overrideArguments=None):
         '--metadata-from-title',
         metavar='FORMAT', dest='metafromtitle',
         help='Parse additional metadata like song title / artist from the video title. '
-             'The format syntax is the same as --output, '
-             'the parsed parameters replace existing values. '
-             'Additional templates: %(album)s, %(artist)s. '
+             'The format syntax is the same as --output. Regular expression with '
+             'named capture groups may also be used. '
+             'The parsed parameters replace existing values. '
              'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
-             '"Coldplay - Paradise"')
+             '"Coldplay - Paradise". '
+             'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')
     postproc.add_option(
         '--xattrs',
         action='store_true', dest='xattrs', default=False,
index 90630c2d7391de9fd288662c8f207433702f8c99..64dabe790bcb74e88b4959f3c20e0eb23a0bcf8c 100644 (file)
@@ -4,7 +4,10 @@ import subprocess
 
 from .common import PostProcessor
 from ..compat import compat_shlex_quote
-from ..utils import PostProcessingError
+from ..utils import (
+    encodeArgument,
+    PostProcessingError,
+)
 
 
 class ExecAfterDownloadPP(PostProcessor):
@@ -20,7 +23,7 @@ class ExecAfterDownloadPP(PostProcessor):
         cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))
 
         self._downloader.to_screen('[exec] Executing command: %s' % cmd)
-        retCode = subprocess.call(cmd, shell=True)
+        retCode = subprocess.call(encodeArgument(cmd), shell=True)
         if retCode != 0:
             raise PostProcessingError(
                 'Command returned error code %d' % retCode)
index 665109558849692608db45fbdae9ade5a39d4f95..51256a3fb52c0f51547c335eecf6774af07307c8 100644 (file)
@@ -4,6 +4,7 @@ import io
 import os
 import subprocess
 import time
+import re
 
 
 from .common import AudioConversionError, PostProcessor
@@ -22,6 +23,7 @@ from ..utils import (
     subtitles_filename,
     dfxp2srt,
     ISO639Utils,
+    replace_extension,
 )
 
 
@@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
 
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
+        in_filenames = [filename]
+        options = []
 
         if info['ext'] == 'm4a':
-            options = ['-vn', '-acodec', 'copy']
+            options.extend(['-vn', '-acodec', 'copy'])
         else:
-            options = ['-c', 'copy']
+            options.extend(['-c', 'copy'])
 
         for (name, value) in metadata.items():
             options.extend(['-metadata', '%s=%s' % (name, value)])
 
+        chapters = info.get('chapters', [])
+        if chapters:
+            metadata_filename = replace_extension(filename, 'meta')
+            with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+                def ffmpeg_escape(text):
+                    return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
+
+                metadata_file_content = ';FFMETADATA1\n'
+                for chapter in chapters:
+                    metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+                    metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+                    metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+                    chapter_title = chapter.get('title')
+                    if chapter_title:
+                        metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+                f.write(metadata_file_content)
+                in_filenames.append(metadata_filename)
+                options.extend(['-map_metadata', '1'])
+
         self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
-        self.run_ffmpeg(filename, temp_filename, options)
+        self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
+        if chapters:
+            os.remove(metadata_filename)
         os.remove(encodeFilename(filename))
         os.rename(encodeFilename(temp_filename), encodeFilename(filename))
         return [], info
@@ -517,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor):
             temp_filename = prepend_extension(filename, 'temp')
 
             options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
-            self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename)
+            self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
             self.run_ffmpeg(filename, temp_filename, options)
 
             os.remove(encodeFilename(filename))
index a7d637a3c84f16ba8024b618ea30549fdcf109a6..f5c14d974f4b44c9a5c014a536a0b2f72c65b2db 100644 (file)
@@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor):
     def __init__(self, downloader, titleformat):
         super(MetadataFromTitlePP, self).__init__(downloader)
         self._titleformat = titleformat
-        self._titleregex = self.format_to_regex(titleformat)
+        self._titleregex = (self.format_to_regex(titleformat)
+                            if re.search(r'%\(\w+\)s', titleformat)
+                            else titleformat)
 
     def format_to_regex(self, fmt):
         r"""
@@ -33,11 +35,14 @@ class MetadataFromTitlePP(PostProcessor):
         title = info['title']
         match = re.match(self._titleregex, title)
         if match is None:
-            self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat)
+            self._downloader.to_screen(
+                '[fromtitle] Could not interpret title of video as "%s"'
+                % self._titleformat)
             return [], info
         for attribute, value in match.groupdict().items():
-            value = match.group(attribute)
             info[attribute] = value
-            self._downloader.to_screen('[fromtitle] parsed ' + attribute + ': ' + value)
+            self._downloader.to_screen(
+                '[fromtitle] parsed %s: %s'
+                % (attribute, value if value is not None else 'NA'))
 
         return [], info
index 91e235ff2f6166106c93e4b8de5bbeb18a6d8b7a..c42dd4c3ae20d59b6646b64180720c4482a0e901 100644 (file)
@@ -11,6 +11,7 @@ import contextlib
 import ctypes
 import datetime
 import email.utils
+import email.header
 import errno
 import functools
 import gzip
@@ -21,7 +22,6 @@ import locale
 import math
 import operator
 import os
-import pipes
 import platform
 import random
 import re
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib
 
 from .compat import (
+    compat_HTMLParseError,
     compat_HTMLParser,
     compat_basestring,
     compat_chr,
@@ -364,9 +365,9 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
     retlist = []
     for m in re.finditer(r'''(?xs)
         <([a-zA-Z0-9:._-]+)
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
          \s+%s=['"]?%s['"]?
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s*>
         (?P<content>.*?)
         </\1>
@@ -408,8 +409,12 @@ def extract_attributes(html_element):
     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
     """
     parser = HTMLAttributeParser()
-    parser.feed(html_element)
-    parser.close()
+    try:
+        parser.feed(html_element)
+        parser.close()
+    # Older Python may throw HTMLParseError in case of malformed HTML
+    except compat_HTMLParseError:
+        pass
     return parser.attrs
 
 
@@ -421,8 +426,8 @@ def clean_html(html):
 
     # Newline vs <br />
     html = html.replace('\n', ' ')
-    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
-    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
     # Strip html tags
     html = re.sub('<.*?>', '', html)
     # Replace html entities
@@ -591,7 +596,7 @@ def unescapeHTML(s):
     assert type(s) == compat_str
 
     return re.sub(
-        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def get_subprocess_encoding():
@@ -931,14 +936,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
         except zlib.error:
             return zlib.decompress(data)
 
-    @staticmethod
-    def addinfourl_wrapper(stream, headers, url, code):
-        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
-            return compat_urllib_request.addinfourl(stream, headers, url, code)
-        ret = compat_urllib_request.addinfourl(stream, headers, url)
-        ret.code = code
-        return ret
-
     def http_request(self, req):
         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
         # always respected by websites, some tend to give out URLs with non percent-encoded
@@ -990,13 +987,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                     break
                 else:
                     raise original_ioerror
-            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
+            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
         # deflate
         if resp.headers.get('Content-encoding', '') == 'deflate':
             gz = io.BytesIO(self.deflate(resp.read()))
-            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
+            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
             del resp.headers['Content-encoding']
         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
@@ -1186,7 +1183,7 @@ def unified_timestamp(date_str, day_first=True):
     if date_str is None:
         return None
 
-    date_str = date_str.replace(',', ' ')
+    date_str = re.sub(r'[,|]', '', date_str)
 
     pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
     timezone, date_str = extract_timezone(date_str)
@@ -1194,6 +1191,11 @@ def unified_timestamp(date_str, day_first=True):
     # Remove AM/PM + timezone
     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 
+    # Remove unrecognized timezones from ISO 8601 alike timestamps
+    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+    if m:
+        date_str = date_str[:-len(m.group('tz'))]
+
     for expression in date_formats(day_first):
         try:
             dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
@@ -1532,7 +1534,7 @@ def shell_quote(args):
         if isinstance(a, bytes):
             # We may get a filename encoded with 'encodeFilename'
             a = a.decode(encoding)
-        quoted_args.append(pipes.quote(a))
+        quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)
 
 
@@ -1813,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
         return default
 
 
+def bool_or_none(v, default=None):
+    return v if isinstance(v, bool) else default
+
+
 def strip_or_none(v):
     return None if v is None else v.strip()
 
@@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     return new_req
 
 
+def _multipart_encode_impl(data, boundary):
+    content_type = 'multipart/form-data; boundary=%s' % boundary
+
+    out = b''
+    for k, v in data.items():
+        out += b'--' + boundary.encode('ascii') + b'\r\n'
+        if isinstance(k, compat_str):
+            k = k.encode('utf-8')
+        if isinstance(v, compat_str):
+            v = v.encode('utf-8')
+        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+        if boundary.encode('ascii') in content:
+            raise ValueError('Boundary overlaps with data')
+        out += content
+
+    out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+    return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+    '''
+    Encode a dict to RFC 7578-compliant form-data
+
+    data:
+        A dict where keys and values can be either Unicode or bytes-like
+        objects.
+    boundary:
+        If specified a Unicode object, it's used as the boundary. Otherwise
+        a random boundary is generated.
+
+    Reference: https://tools.ietf.org/html/rfc7578
+    '''
+    has_specified_boundary = boundary is not None
+
+    while True:
+        if boundary is None:
+            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+        try:
+            out, content_type = _multipart_encode_impl(data, boundary)
+            break
+        except ValueError:
+            if has_specified_boundary:
+                raise
+            boundary = None
+
+    return out, content_type
+
+
 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
     if isinstance(key_or_keys, (list, tuple)):
         for key in key_or_keys:
@@ -2153,7 +2211,12 @@ def parse_age_limit(s):
 
 def strip_jsonp(code):
     return re.sub(
-        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+        r'''(?sx)^
+            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+            (?:\s*&&\s*(?P=func_name))?
+            \s*\(\s*(?P<callback_data>.*)\);?
+            \s*?(?://[^\n]*)*$''',
+        r'\g<callback_data>', code)
 
 
 def js_to_json(code):
@@ -2273,10 +2336,8 @@ def mimetype2ext(mt):
     return {
         '3gpp': '3gp',
         'smptett+xml': 'tt',
-        'srt': 'srt',
         'ttaf+xml': 'dfxp',
         'ttml+xml': 'ttml',
-        'vtt': 'vtt',
         'x-flv': 'flv',
         'x-mp4-fragmented': 'mp4',
         'x-ms-wmv': 'wmv',
@@ -2284,11 +2345,11 @@ def mimetype2ext(mt):
         'x-mpegurl': 'm3u8',
         'vnd.apple.mpegurl': 'm3u8',
         'dash+xml': 'mpd',
-        'f4m': 'f4m',
         'f4m+xml': 'f4m',
         'hds+xml': 'f4m',
         'vnd.ms-sstr+xml': 'ism',
         'quicktime': 'mov',
+        'mp2t': 'ts',
     }.get(res, res)
 
 
@@ -2304,11 +2365,11 @@ def parse_codecs(codecs_str):
         if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
             if not vcodec:
                 vcodec = full_codec
-        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
             if not acodec:
                 acodec = full_codec
         else:
-            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
     if not vcodec and not acodec:
         if len(splited_codecs) == 2:
             return {
@@ -2676,6 +2737,8 @@ def cli_option(params, command_option, param):
 
 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
     param = params.get(param)
+    if param is None:
+        return []
     assert isinstance(param, bool)
     if separator:
         return [command_option + separator + (true_value if param else false_value)]
@@ -3757,3 +3820,11 @@ def write_xattr(path, key, value):
                         "Couldn't find a tool to set the xattrs. "
                         "Install either the python 'xattr' module, "
                         "or the 'xattr' binary.")
+
+
+def random_birthday(year_field, month_field, day_field):
+    return {
+        year_field: str(random.randint(1950, 1995)),
+        month_field: str(random.randint(1, 12)),
+        day_field: str(random.randint(1, 31)),
+    }
index 8c77f19052496050f9f0dcfb2ba457707fb71b3e..cdcb32e067bde9ed2391971daa758a852036af36 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2017.04.28'
+__version__ = '2017.09.11'