Merge pull request #9288 from reyyed/issue#9063fix
authorYen Chi Hsuan <yan12125@gmail.com>
Sat, 9 Jul 2016 06:29:53 +0000 (14:29 +0800)
committerGitHub <noreply@github.com>
Sat, 9 Jul 2016 06:29:53 +0000 (14:29 +0800)
[ffmpeg] Fix embedding subtitles (#9063)

269 files changed:
.github/ISSUE_TEMPLATE.md
.github/PULL_REQUEST_TEMPLATE.md [new file with mode: 0644]
.gitignore
.travis.yml
AUTHORS
CONTRIBUTING.md
Makefile
README.md
devscripts/buildserver.py
devscripts/create-github-release.py [new file with mode: 0644]
devscripts/install_srelay.sh [new file with mode: 0755]
devscripts/make_lazy_extractors.py
devscripts/prepare_manpage.py
devscripts/release.sh
devscripts/show-downloads-statistics.py [new file with mode: 0644]
docs/supportedsites.md
setup.py
test/helper.py
test/test_InfoExtractor.py
test/test_all_urls.py
test/test_compat.py
test/test_http.py
test/test_socks.py [new file with mode: 0644]
test/test_utils.py
tox.ini
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/compat.py
youtube_dl/downloader/external.py
youtube_dl/downloader/f4m.py
youtube_dl/downloader/hls.py
youtube_dl/extractor/abcnews.py [new file with mode: 0644]
youtube_dl/extractor/adobetv.py
youtube_dl/extractor/aenetworks.py
youtube_dl/extractor/afreecatv.py [new file with mode: 0644]
youtube_dl/extractor/aftonbladet.py
youtube_dl/extractor/amp.py
youtube_dl/extractor/animeondemand.py
youtube_dl/extractor/anvato.py [new file with mode: 0644]
youtube_dl/extractor/aol.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/ard.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/audiomack.py
youtube_dl/extractor/azubu.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/bbc.py
youtube_dl/extractor/bet.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/biqle.py [new file with mode: 0644]
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/br.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/buzzfeed.py
youtube_dl/extractor/byutv.py
youtube_dl/extractor/canalplus.py
youtube_dl/extractor/carambatv.py [new file with mode: 0644]
youtube_dl/extractor/cbc.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/cbsinteractive.py
youtube_dl/extractor/cbslocal.py [new file with mode: 0644]
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/cbssports.py
youtube_dl/extractor/ccc.py
youtube_dl/extractor/cda.py
youtube_dl/extractor/ceskatelevize.py
youtube_dl/extractor/channel9.py
youtube_dl/extractor/cinemassacre.py [deleted file]
youtube_dl/extractor/cliprs.py
youtube_dl/extractor/closertotruth.py [new file with mode: 0644]
youtube_dl/extractor/cloudy.py
youtube_dl/extractor/collegehumor.py [deleted file]
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/common.py
youtube_dl/extractor/coub.py [new file with mode: 0644]
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/ctv.py [new file with mode: 0644]
youtube_dl/extractor/ctvnews.py [new file with mode: 0644]
youtube_dl/extractor/cwtv.py
youtube_dl/extractor/dailymail.py [new file with mode: 0644]
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/daum.py
youtube_dl/extractor/dcn.py
youtube_dl/extractor/dfb.py
youtube_dl/extractor/discovery.py
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dw.py
youtube_dl/extractor/eagleplatform.py
youtube_dl/extractor/eporner.py
youtube_dl/extractor/espn.py
youtube_dl/extractor/extractors.py
youtube_dl/extractor/eyedotv.py [new file with mode: 0644]
youtube_dl/extractor/facebook.py
youtube_dl/extractor/fczenit.py
youtube_dl/extractor/flickr.py
youtube_dl/extractor/flipagram.py [new file with mode: 0644]
youtube_dl/extractor/formula1.py [new file with mode: 0644]
youtube_dl/extractor/foxsports.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/funimation.py
youtube_dl/extractor/fusion.py [new file with mode: 0644]
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/gametrailers.py [deleted file]
youtube_dl/extractor/generic.py
youtube_dl/extractor/godtv.py [new file with mode: 0644]
youtube_dl/extractor/groupon.py
youtube_dl/extractor/hearthisat.py
youtube_dl/extractor/howcast.py
youtube_dl/extractor/hrti.py [new file with mode: 0644]
youtube_dl/extractor/imdb.py
youtube_dl/extractor/indavideo.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iqiyi.py
youtube_dl/extractor/jwplatform.py
youtube_dl/extractor/kaltura.py
youtube_dl/extractor/kamcord.py [new file with mode: 0644]
youtube_dl/extractor/kuwo.py
youtube_dl/extractor/la7.py
youtube_dl/extractor/learnr.py [new file with mode: 0644]
youtube_dl/extractor/leeco.py
youtube_dl/extractor/libraryofcongress.py [new file with mode: 0644]
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/litv.py [new file with mode: 0644]
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/livestream.py
youtube_dl/extractor/localnews8.py [new file with mode: 0644]
youtube_dl/extractor/lynda.py
youtube_dl/extractor/m6.py
youtube_dl/extractor/malemotion.py [deleted file]
youtube_dl/extractor/matchtv.py
youtube_dl/extractor/meta.py [new file with mode: 0644]
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/mgtv.py
youtube_dl/extractor/microsoftvirtualacademy.py [new file with mode: 0644]
youtube_dl/extractor/mitele.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/msn.py [new file with mode: 0644]
youtube_dl/extractor/mtv.py
youtube_dl/extractor/muzu.py [deleted file]
youtube_dl/extractor/mwave.py
youtube_dl/extractor/nationalgeographic.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/ndtv.py
youtube_dl/extractor/nfb.py
youtube_dl/extractor/nick.py
youtube_dl/extractor/ninecninemedia.py [new file with mode: 0644]
youtube_dl/extractor/normalboots.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/nuvid.py
youtube_dl/extractor/odnoklassniki.py
youtube_dl/extractor/onet.py [new file with mode: 0644]
youtube_dl/extractor/onionstudios.py
youtube_dl/extractor/ooyala.py
youtube_dl/extractor/openload.py
youtube_dl/extractor/ora.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/periscope.py
youtube_dl/extractor/pladform.py
youtube_dl/extractor/playwire.py
youtube_dl/extractor/polskieradio.py [new file with mode: 0644]
youtube_dl/extractor/pornhd.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/r7.py
youtube_dl/extractor/radiocanada.py [new file with mode: 0644]
youtube_dl/extractor/radiojavan.py
youtube_dl/extractor/rai.py
youtube_dl/extractor/rds.py
youtube_dl/extractor/redtube.py
youtube_dl/extractor/reuters.py [new file with mode: 0644]
youtube_dl/extractor/revision3.py
youtube_dl/extractor/rockstargames.py [new file with mode: 0644]
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rtvnh.py
youtube_dl/extractor/sandia.py
youtube_dl/extractor/scivee.py
youtube_dl/extractor/screenwavemedia.py
youtube_dl/extractor/seeker.py [new file with mode: 0644]
youtube_dl/extractor/sendtonews.py [new file with mode: 0644]
youtube_dl/extractor/sexykarma.py [deleted file]
youtube_dl/extractor/sina.py
youtube_dl/extractor/sixplay.py [new file with mode: 0644]
youtube_dl/extractor/skynewsarabia.py
youtube_dl/extractor/skysports.py [new file with mode: 0644]
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/snagfilms.py [deleted file]
youtube_dl/extractor/sohu.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/sportschau.py [new file with mode: 0644]
youtube_dl/extractor/srmediathek.py
youtube_dl/extractor/stitcher.py
youtube_dl/extractor/streamcloud.py
youtube_dl/extractor/svt.py
youtube_dl/extractor/tagesschau.py
youtube_dl/extractor/teachingchannel.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/ted.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/telegraaf.py
youtube_dl/extractor/telewebion.py [new file with mode: 0644]
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/thesixtyone.py
youtube_dl/extractor/threeqsdn.py [new file with mode: 0644]
youtube_dl/extractor/toutv.py
youtube_dl/extractor/tvp.py
youtube_dl/extractor/tweakers.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/twentymin.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/twitter.py
youtube_dl/extractor/udemy.py
youtube_dl/extractor/udn.py
youtube_dl/extractor/unistra.py
youtube_dl/extractor/urplay.py [new file with mode: 0644]
youtube_dl/extractor/ustudio.py
youtube_dl/extractor/veoh.py
youtube_dl/extractor/vessel.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/vidbit.py [new file with mode: 0644]
youtube_dl/extractor/vidio.py [new file with mode: 0644]
youtube_dl/extractor/viewlift.py [new file with mode: 0644]
youtube_dl/extractor/viewster.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vlive.py
youtube_dl/extractor/voicerepublic.py
youtube_dl/extractor/voxmedia.py
youtube_dl/extractor/vporn.py
youtube_dl/extractor/vrt.py
youtube_dl/extractor/vulture.py [deleted file]
youtube_dl/extractor/washingtonpost.py
youtube_dl/extractor/wat.py
youtube_dl/extractor/watchindianporn.py [new file with mode: 0644]
youtube_dl/extractor/wdr.py
youtube_dl/extractor/weibo.py [deleted file]
youtube_dl/extractor/wimp.py
youtube_dl/extractor/wistia.py
youtube_dl/extractor/wrzuta.py
youtube_dl/extractor/wsj.py
youtube_dl/extractor/xfileshare.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xiami.py [new file with mode: 0644]
youtube_dl/extractor/xminus.py
youtube_dl/extractor/xnxx.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/yandexmusic.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/jsinterp.py
youtube_dl/options.py
youtube_dl/postprocessor/execafterdownload.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/socks.py [new file with mode: 0644]
youtube_dl/swfinterp.py
youtube_dl/update.py
youtube_dl/utils.py
youtube_dl/version.py

index 35f8e686346d8ea25b442cfcd882b350726a54ef..a1b5b0baa816ee3cb6d2ff19428b55290628a6ee 100644 (file)
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.19**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.09.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated versions will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.09.1**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.04.19
+[debug] youtube-dl version 2016.07.09.1
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644 (file)
index 0000000..f24bb4b
--- /dev/null
@@ -0,0 +1,22 @@
+## Please follow the guide below
+
+- You will be asked some questions; please read them **carefully** and answer honestly
+- Put an `x` into all the boxes [ ] relevant to your *pull request* (like this: [x])
+- Use the *Preview* tab to see how your *pull request* will actually look
+
+---
+
+### Before submitting a *pull request* make sure you have:
+- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections
+- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests
+
+### What is the purpose of your *pull request*?
+- [ ] Bug fix
+- [ ] New extractor
+- [ ] New feature
+
+---
+
+### Description of your *pull request* and other information
+
+Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible.
index 72c10425d675f7c1952061be0057db0c2e5e232d..a802c75a10225f53f8da414fa34fd5422de5bea2 100644 (file)
@@ -28,10 +28,16 @@ updates_key.pem
 *.mp4
 *.m4a
 *.m4v
+*.mp3
 *.part
 *.swp
 test/testdata
+test/local_parameters.json
 .tox
 youtube-dl.zsh
+
+# IntelliJ related files
 .idea
-.idea/*
+*.iml
+
+tmp/
index cc21fae8f41ca567a2367d3515f981d8ec0af759..136c339f0cf9058fd9811077c6257afc553e30cf 100644 (file)
@@ -7,11 +7,13 @@ python:
   - "3.4"
   - "3.5"
 sudo: false
+install:
+  - bash ./devscripts/install_srelay.sh
+  - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6
 script: nosetests test --verbose
 notifications:
   email:
     - filippo.valsorda@gmail.com
-    - phihag@phihag.de
     - yasoob.khld@gmail.com
 #  irc:
 #    channels:
diff --git a/AUTHORS b/AUTHORS
index 07cade723be12afdbbf60485d9dbc2890d6c0f32..f74b30d07056c630e39bc8999cca97ac93123504 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -168,3 +168,12 @@ José Joaquín Atria
 Viťas Strádal
 Kagami Hiiragi
 Philip Huppert
+blahgeek
+Kevin Deldycke
+inondle
+Tomáš Čech
+Déstin Reed
+Roman Tsiupa
+Artur Krysiak
+Jakub Adam Wieczorek
+Aleksandar Topuzović
index c83b8655a595d9d040ef09c2c11c07d51f3f7d29..fbf0ab7e80b4244f0bf1c7fb60c5565437e452f4 100644 (file)
@@ -97,9 +97,17 @@ If you want to add support for a new site, first of all **make sure** this site
 After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
-2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
-3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+2. Check out the source code with:
+
+        git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+
+3. Start a new git branch with
+
+        cd youtube-dl
+        git checkout -b yourextractor
+
 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
     ```python
     # coding: utf-8
     from __future__ import unicode_literals
@@ -142,17 +150,149 @@ After you have ensured this site is distributing it's content legally, you can f
     ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want.
-8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
-9. Check the code with [flake8](https://pypi.python.org/pypi/flake8).
-10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions that youtube-dl claims to support, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
 
 In any case, thank you very much for your contributions!
 
+## youtube-dl coding conventions
+
+This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
+
+Extractors are very fragile by nature since they depend on the layout of the source data provided by a 3rd-party media hoster that is out of your control, and this layout tends to change. As an extractor implementer, your task is not only to write code that extracts media links and metadata correctly, but also to minimize the code's dependency on the source's layout and even to anticipate potential future changes. This is important because it allows the extractor to survive minor layout changes, thus keeping old youtube-dl versions working. Even though such breakage is easily fixed by shipping a new version of youtube-dl with the fix incorporated, all previous versions become broken in all repositories and distros' packages, which may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non-rolling-release distros.
+
+### Mandatory and optional metafields
+
+For extraction to work, youtube-dl relies on the metadata your extractor extracts and provides to youtube-dl, expressed as an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction:
+
+ - `id` (media identifier)
+ - `title` (media title)
+ - `url` (media download URL) or `formats`
+
+In fact, only the last of these is technically mandatory (i.e. if you can't figure out the download location of the media, the extraction makes no sense), but by convention youtube-dl also treats `id` and `title` as mandatory. These metafields are thus the critical data without which extraction cannot proceed; if any of them fails to be extracted, the extractor is considered completely broken.
+
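+As a minimal sketch, an extractor's `_real_extract` returning only the mandatory fields might look like this (the page layout and the regex are purely illustrative assumptions; `_match_id`, `_download_webpage`, `_og_search_title` and `_html_search_regex` are real helpers from `common.py`):
+
+```python
+def _real_extract(self, url):
+    video_id = self._match_id(url)
+    webpage = self._download_webpage(url, video_id)
+    return {
+        'id': video_id,
+        'title': self._og_search_title(webpage),
+        # A direct media URL; many extractors provide a `formats` list instead
+        'url': self._html_search_regex(
+            r'<source[^>]+src=["\']([^"\']+)', webpage, 'video URL'),
+    }
+```
+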
+[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof**, so as not to break the extraction of the general-purpose mandatory fields.
+
+#### Example
+
+Say you have a source dictionary `meta` that you've fetched as JSON over HTTP and that has a key `summary`:
+
+```python
+meta = self._download_json(url, video_id)
+```
+
+Assume at this point `meta`'s layout is:
+
+```python
+{
+    ...
+    "summary": "some fancy summary text",
+    ...
+}
+```
+
+Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional metafield, you should be prepared for this key to be missing from the `meta` dict, so you should extract it like:
+
+```python
+description = meta.get('summary')  # correct
+```
+
+and not like:
+
+```python
+description = meta['summary']  # incorrect
+```
+
+The latter will break the extraction process with a `KeyError` if `summary` disappears from `meta` at some later time, while with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember that `None` is equivalent to the absence of data).
+
+Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', fatal=False)
+```
+
+With `fatal` set to `False`, if `_search_regex` fails to extract `description`, it will emit a warning and continue extraction.
+
+You can also pass `default=<some fallback value>`, for example:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', default=None)
+```
+
+On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
+
+### Provide fallbacks
+
+When extracting metadata, try to cover several scenarios. For example, if `title` is present in several places/sources, try extracting it from at least some of them. This makes the extractor more future-proof in case some of the sources become unavailable.
+
+#### Example
+
+Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory metafield, you should end up with something like:
+
+```python
+title = meta['title']
+```
+
+If `title` disappears from `meta` in the future due to some change on the hoster's side, the extraction will fail, since `title` is mandatory. That's expected.
+
+Assume that you have another source you can extract `title` from, for example the `og:title` HTML meta tag of the `webpage`. In this case you can provide a fallback scenario:
+
+```python
+title = meta.get('title') or self._og_search_title(webpage)
+```
+
+This code will try to extract the title from `meta` first, and if that fails, it will try extracting `og:title` from the `webpage`.
+
+### Make regular expressions flexible
+
+When using regular expressions, try to write them in a fuzzy and flexible way.
+
+#### Example
+
+Say you need to extract `title` from the following HTML code:
+
+```html
+<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
+```
+
+The code for that task should look similar to:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
+```
+
+Or even better:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
+    webpage, 'title', group='title')
+```
+
+Note how this tolerates potential changes in the `style` attribute's value as well as a switch from double to single quotes around the `class` attribute's value.
+
+The code definitely should not look like:
+
+```python
+title = self._search_regex(
+    r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
+    webpage, 'title')
+```
+
+### Use safe conversion functions
+
+Wrap all extracted numeric data in the safe conversion functions from `utils`: `int_or_none` and `float_or_none`. Use them for string-to-number conversions as well.
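+
+For instance, a short sketch (the `duration` and `timestamp` keys in `meta` are hypothetical):
+
+```python
+from youtube_dl.utils import float_or_none, int_or_none
+
+# Returns None instead of raising if the value is missing or malformed
+duration = int_or_none(meta.get('duration'))
+# scale=1000 converts a value in milliseconds to seconds
+timestamp = float_or_none(meta.get('timestamp'), scale=1000)
+```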
+
index 06cffcb710c6fd8fa6962007bd07d4753d5d5af6..6ee4ba4ebc6804ad78061d6b346fd67cd3fd01e5 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
 
 clean:
-       rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+       rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
        find . -name "*.pyc" -delete
        find . -name "*.class" -delete
 
@@ -37,7 +37,7 @@ test:
 ot: offlinetest
 
 offlinetest: codetest
-       $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py
+       $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py
 
 tar: youtube-dl.tar.gz
 
@@ -69,7 +69,7 @@ README.txt: README.md
        pandoc -f markdown -t plain README.md -o README.txt
 
 youtube-dl.1: README.md
-       $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md
+       $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md
        pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1
        rm -f youtube-dl.1.temp.md
 
index cd18edd87877239f622b2acdfa4632f7291656f3..44332ea9ac0cf28cc7475078a7ccbfbd7566b988 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms
 
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
-    sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+    sudo curl -L https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
     sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
@@ -25,20 +25,26 @@ If you do not have curl, you can alternatively use a recent wget:
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
     sudo chmod a+rx /usr/local/bin/youtube-dl
 
-Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
+Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put it in `C:\Windows\System32`).
 
-OS X users can install **youtube-dl** with [Homebrew](http://brew.sh/).
+You can also use pip:
+
+    sudo pip install --upgrade youtube-dl
+
+This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
+
+OS X users can install youtube-dl with [Homebrew](http://brew.sh/):
 
     brew install youtube-dl
 
-You can also use pip:
+Or with [MacPorts](https://www.macports.org/):
 
-    sudo pip install youtube-dl
+    sudo port install youtube-dl
 
 Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html).
 
 # DESCRIPTION
-**youtube-dl** is a small command-line program to download videos from
+**youtube-dl** is a command-line program to download videos from
 YouTube.com and a few more sites. It requires the Python interpreter, version
 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on
 your Unix box, on Windows or on Mac OS X. It is released to the public domain,
@@ -73,8 +79,8 @@ which means you can modify it, redistribute it or use it however you like.
                                      repairs broken URLs, but emits an error if
                                      this is not possible instead of searching.
     --ignore-config                  Do not read configuration files. When given
-                                     in the global configuration file /etc
-                                     /youtube-dl.conf: Do not read the user
+                                     in the global configuration file
+                                     /etc/youtube-dl.conf: Do not read the user
                                      configuration in ~/.config/youtube-
                                      dl/config (%APPDATA%/youtube-dl/config.txt
                                      on Windows)
@@ -85,9 +91,11 @@ which means you can modify it, redistribute it or use it however you like.
     --no-color                       Do not emit color codes in output
 
 ## Network Options:
-    --proxy URL                      Use the specified HTTP/HTTPS proxy. Pass in
-                                     an empty string (--proxy "") for direct
-                                     connection
+    --proxy URL                      Use the specified HTTP/HTTPS/SOCKS proxy.
+                                     To enable experimental SOCKS proxy, specify
+                                     a proper scheme. For example
+                                     socks5://127.0.0.1:1080/. Pass in an empty
+                                     string (--proxy "") for direct connection
     --socket-timeout SECONDS         Time to wait before giving up, in seconds
     --source-address IP              Client-side IP address to bind to
                                      (experimental)
@@ -95,9 +103,9 @@ which means you can modify it, redistribute it or use it however you like.
                                      (experimental)
     -6, --force-ipv6                 Make all connections via IPv6
                                      (experimental)
-    --cn-verification-proxy URL      Use this proxy to verify the IP address for
-                                     some Chinese sites. The default proxy
-                                     specified by --proxy (or none, if the
+    --geo-verification-proxy URL     Use this proxy to verify the IP address for
+                                     some geo-restricted sites. The default
+                                     proxy specified by --proxy (or none, if the
                                      options is not present) is used for the
                                      actual downloading. (experimental)
 
@@ -160,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      (experimental)
 
 ## Download Options:
-    -r, --rate-limit LIMIT           Maximum download rate in bytes per second
+    -r, --limit-rate RATE            Maximum download rate in bytes per second
                                      (e.g. 50K or 4.2M)
     -R, --retries RETRIES            Number of retries (default is 10), or
                                      "infinite".
@@ -176,7 +184,9 @@ which means you can modify it, redistribute it or use it however you like.
     --xattr-set-filesize             Set file xattribute ytdl.filesize with
                                      expected filesize (experimental)
     --hls-prefer-native              Use the native HLS downloader instead of
-                                     ffmpeg (experimental)
+                                     ffmpeg
+    --hls-prefer-ffmpeg              Use ffmpeg instead of the native HLS
+                                     downloader
     --hls-use-mpegts                 Use the mpegts container for HLS videos,
                                      allowing to play the video while
                                      downloading (some players may not be able
@@ -245,18 +255,19 @@ which means you can modify it, redistribute it or use it however you like.
     --write-info-json                Write video metadata to a .info.json file
     --write-annotations              Write video annotations to a
                                      .annotations.xml file
-    --load-info FILE                 JSON file containing the video information
+    --load-info-json FILE            JSON file containing the video information
                                      (created with the "--write-info-json"
                                      option)
     --cookies FILE                   File to read cookies from and dump cookie
                                      jar in
     --cache-dir DIR                  Location in the filesystem where youtube-dl
                                      can store some downloaded information
-                                     permanently. By default $XDG_CACHE_HOME
-                                     /youtube-dl or ~/.cache/youtube-dl . At the
-                                     moment, only YouTube player files (for
-                                     videos with obfuscated signatures) are
-                                     cached, but that may change.
+                                     permanently. By default
+                                     $XDG_CACHE_HOME/youtube-dl or
+                                     ~/.cache/youtube-dl . At the moment, only
+                                     YouTube player files (for videos with
+                                     obfuscated signatures) are cached, but that
+                                     may change.
     --no-cache-dir                   Disable filesystem caching
     --rm-cache-dir                   Delete all filesystem cache files
 
@@ -413,7 +424,7 @@ which means you can modify it, redistribute it or use it however you like.
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`.
+You can configure youtube-dl by placing any supported command line option into a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that the configuration file may not exist by default, so you may need to create it yourself.
 
 For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
 ```
@@ -421,6 +432,7 @@ For example, with the following configuration file youtube-dl will always extrac
 --no-mtime
 --proxy 127.0.0.1:3128
 -o ~/Movies/%(title)s.%(ext)s
+# Lines starting with # are comments
 ```
 
 Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`.
@@ -429,7 +441,7 @@ You can use `--ignore-config` if you want to disable the configuration file for
 
 ### Authentication with `.netrc` file
 
-You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create a`.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`), in order not to pass credentials as command line arguments on every youtube-dl execution and to prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
 ```
 touch $HOME/.netrc
 chmod a-rwx,u+rw $HOME/.netrc
@@ -463,7 +475,7 @@ The basic usage is not to set any template arguments when downloading a single f
  - `display_id`: An alternative identifier for the video
  - `uploader`: Full name of the video uploader
  - `license`: License name the video is licensed under
- - `creator`: The main artist who created the video
+ - `creator`: The creator of the video
  - `release_date`: The date (YYYYMMDD) when the video was released
  - `timestamp`: UNIX timestamp of the moment the video became available
  - `upload_date`: Video upload date (YYYYMMDD)
@@ -500,6 +512,9 @@ The basic usage is not to set any template arguments when downloading a single f
  - `autonumber`: Five-digit number that will be increased with each download, starting at zero
  - `playlist`: Name or id of the playlist that contains the video
  - `playlist_index`: Index of the video in the playlist padded with leading zeros according to the total length of the playlist
+ - `playlist_id`: Playlist identifier
+ - `playlist_title`: Playlist title
+
 
 Available for the video that belongs to some logical chapter or section:
  - `chapter`: Name or title of the chapter the video belongs to
@@ -515,6 +530,18 @@ Available for the video that is an episode of some series or programme:
  - `episode_number`: Number of the video episode within a season
  - `episode_id`: Id of the video episode
 
+Available for the media that is a track or a part of a music album:
+ - `track`: Title of the track
+ - `track_number`: Number of the track within an album or a disc
+ - `track_id`: Id of the track
+ - `artist`: Artist(s) of the track
+ - `genre`: Genre(s) of the track
+ - `album`: Title of the album the track belongs to
+ - `album_type`: Type of the album
+ - `album_artist`: List of all artists that appear on the album
+ - `disc_number`: Number of the disc or other physical medium the track belongs to
+ - `release_year`: Year (YYYY) when the album was released
+
 Each aforementioned sequence when referenced in output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`.
 
 For example for `-o %(title)s-%(id)s.%(ext)s` and mp4 video with title `youtube-dl test video` and id `BaW_jenozKcj` this will result in a `youtube-dl test video-BaW_jenozKcj.mp4` file created in the current directory.
@@ -527,6 +554,10 @@ The current default template is `%(title)s-%(id)s.%(ext)s`.
 
 In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
 
+#### Output template and Windows batch files
+
+If you are using an output template inside a Windows batch file, you must escape plain percent characters (`%`) by doubling them, so that `-o "%(title)s-%(id)s.%(ext)s"` becomes `-o "%%(title)s-%%(id)s.%%(ext)s"`. However, you should not touch `%`s that are not plain characters, e.g. environment variables for expansion should stay intact: `-o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s"`.
+
 #### Output template examples
 
 Note on Windows you may need to use double quotes instead of single.
@@ -677,12 +708,20 @@ hash -r
 
 Again, from then on you'll be able to update with `sudo youtube-dl -U`.
 
+### youtube-dl is extremely slow to start on Windows
+
+Add a file exclusion for `youtube-dl.exe` in Windows Defender settings.
+
 ### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
 
 YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
 
 If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update.
 
+### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number`
+
+Make sure you are not using `-o` together with any of the options `-t`, `--title`, `--id`, `-A` or `--auto-number`, whether on the command line or in a configuration file. Remove the latter if present.
+
 ### Do I always have to pass `-citw`?
 
 By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`.
@@ -703,7 +742,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [
 
 ### I have downloaded a video but how can I play it?
 
-Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/).
+Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/).
 
 ### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser.
 
@@ -760,9 +799,9 @@ means you're using an outdated version of Python. Please update to Python 2.6 or
 
 Since June 2012 ([#342](https://github.com/rg3/youtube-dl/issues/342)) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`.
 
-### The exe throws a *Runtime error from Visual C++*
+### The exe throws an error due to missing `MSVCR100.dll`
 
-To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29).
+To run the exe you need to install first the [Microsoft Visual C++ 2010 Redistributable Package (x86)](https://www.microsoft.com/en-US/download/details.aspx?id=5555).
 
 ### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files?
 
@@ -817,6 +856,12 @@ It is *not* possible to detect whether a URL is supported or not. That's because
 
 If you want to find out whether a given URL is supported, simply call youtube-dl with it. If you get no videos back, chances are the URL is either not referring to a video or unsupported. You can find out which by examining the output (if you run youtube-dl on the console) or catching an `UnsupportedError` exception if you run it from a Python program.
 
+# Why do I need to go through that much red tape when filing bugs?
+
+Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless: for instance, because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was already reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download, and many more simple, easy-to-avoid problems, many of which were totally unrelated to youtube-dl.
+
+youtube-dl is an open-source project manned by too few volunteers, so we'd rather spend time fixing bugs where we are certain none of those simple problems apply, and where we can be reasonably confident that we'll be able to reproduce the issue without asking the reporter repeatedly. As such, the output of `youtube-dl -v YOUR_URL_HERE` is really all that's required to file an issue. The issue template also guides you through some basic steps you can do, such as checking that your version of youtube-dl is current.
+
 # DEVELOPER INSTRUCTIONS
 
 Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
@@ -846,9 +891,17 @@ If you want to add support for a new site, first of all **make sure** this site
 After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
-2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
-3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+2. Check out the source code with:
+
+        git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+
+3. Start a new git branch with
+
+        cd youtube-dl
+        git checkout -b yourextractor
+
 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
     ```python
     # coding: utf-8
     from __future__ import unicode_literals
@@ -891,20 +944,152 @@ After you have ensured this site is distributing it's content legally, you can f
     ```
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want.
-8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
-9. Check the code with [flake8](https://pypi.python.org/pypi/flake8).
-10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions that youtube-dl claims to support, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
 
 In any case, thank you very much for your contributions!
 
+## youtube-dl coding conventions
+
+This section introduces guidelines for writing idiomatic, robust and future-proof extractor code.
+
+Extractors are very fragile by nature since they depend on the layout of the source data provided by a 3rd-party media hoster that is out of your control, and this layout tends to change. As an extractor implementer, your task is not only to write code that extracts media links and metadata correctly, but also to minimize the code's dependency on the source's layout and even to anticipate potential future changes. This is important because it allows the extractor to survive minor layout changes, thus keeping old youtube-dl versions working. Even though such breakage is easily fixed by shipping a new version of youtube-dl with the fix incorporated, all previous versions become broken in all repositories and distros' packages, which may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, which is possible for non-rolling-release distros.
+
+### Mandatory and optional metafields
+
+For extraction to work, youtube-dl relies on the metadata your extractor extracts and provides to youtube-dl, expressed as an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction:
+
+ - `id` (media identifier)
+ - `title` (media title)
+ - `url` (media download URL) or `formats`
+
+In fact, only the last of these is technically mandatory (i.e. if you can't figure out the download location of the media, the extraction makes no sense), but by convention youtube-dl also treats `id` and `title` as mandatory. These metafields are thus the critical data without which extraction cannot proceed; if any of them fails to be extracted, the extractor is considered completely broken.
+
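+As a minimal sketch, an extractor's `_real_extract` returning only the mandatory fields might look like this (the page layout and the regex are purely illustrative assumptions; `_match_id`, `_download_webpage`, `_og_search_title` and `_html_search_regex` are real helpers from `common.py`):
+
+```python
+def _real_extract(self, url):
+    video_id = self._match_id(url)
+    webpage = self._download_webpage(url, video_id)
+    return {
+        'id': video_id,
+        'title': self._og_search_title(webpage),
+        # A direct media URL; many extractors provide a `formats` list instead
+        'url': self._html_search_regex(
+            r'<source[^>]+src=["\']([^"\']+)', webpage, 'video URL'),
+    }
+```
+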
+[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof**, so as not to break the extraction of the general-purpose mandatory fields.
+
+#### Example
+
+Say you have a source dictionary `meta` that you've fetched as JSON over HTTP and that has a key `summary`:
+
+```python
+meta = self._download_json(url, video_id)
+```
+
+Assume at this point `meta`'s layout is:
+
+```python
+{
+    ...
+    "summary": "some fancy summary text",
+    ...
+}
+```
+
+Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional metafield, you should be prepared for this key to be missing from the `meta` dict, so you should extract it like:
+
+```python
+description = meta.get('summary')  # correct
+```
+
+and not like:
+
+```python
+description = meta['summary']  # incorrect
+```
+
+The latter will break the extraction process with a `KeyError` if `summary` disappears from `meta` at some later time, while with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember that `None` is equivalent to the absence of data).
+
+Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', fatal=False)
+```
+
+With `fatal` set to `False`, if `_search_regex` fails to extract `description`, it will emit a warning and continue extraction.
+
+You can also pass `default=<some fallback value>`, for example:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', default=None)
+```
+
+On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present.
+
+### Provide fallbacks
+
+When extracting metadata, try to cover several scenarios. For example, if `title` is present in several places/sources, try extracting it from at least some of them. This makes the extractor more future-proof in case some of the sources become unavailable.
+
+#### Example
+
+Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory metafield, you should end up with something like:
+
+```python
+title = meta['title']
+```
+
+If `title` disappears from `meta` in the future due to changes on the hoster's side, the extraction would fail since `title` is mandatory. That's expected.
+
+Assume that you have another source you can extract `title` from, for example the `og:title` HTML meta tag of a `webpage`. In this case you can provide a fallback scenario:
+
+```python
+title = meta.get('title') or self._og_search_title(webpage)
+```
+
+This code will try to extract `title` from `meta` first and, if that fails, will try extracting `og:title` from the `webpage`.
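+
+If there are even more potential sources, the same pattern extends naturally; a sketch (the `<h1>` regex here is purely illustrative):
+
+```python
+title = (
+    meta.get('title')
+    or self._og_search_title(webpage, default=None)
+    or self._html_search_regex(
+        r'<h1[^>]*>([^<]+)</h1>', webpage, 'title'))
+```
+
+Note that the last fallback is deliberately left fatal: `title` is mandatory, so if every source fails the extractor should error out.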
+
+### Make regular expressions flexible
+
+When using regular expressions, try to write them in a fuzzy and flexible way.
+
+#### Example
+
+Say you need to extract `title` from the following HTML code:
+
+```html
+<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
+```
+
+The code for that task should look similar to:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
+```
+
+Or even better:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
+    webpage, 'title', group='title')
+```
+
+Note how the second regex tolerates potential changes in the `style` attribute's value or a switch from double quotes to single quotes for the `class` attribute.
+
+The code definitely should not look like:
+
+```python
+title = self._search_regex(
+    r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
+    webpage, 'title')
+```
+
+### Use safe conversion functions
+
+Wrap all extracted numeric data in the safe conversion functions from `utils`: `int_or_none`, `float_or_none`. Use them for string-to-number conversions as well.
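+
+For example, a sketch of how these helpers might be used (the `duration` and `rating` keys in `meta` are hypothetical):
+
+```python
+from ..utils import int_or_none, float_or_none
+
+duration = int_or_none(meta.get('duration'))        # '128' -> 128, None stays None
+average_rating = float_or_none(meta.get('rating'))  # '4.5' -> 4.5, None stays None
+```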
+
 # EMBEDDING YOUTUBE-DL
 
 youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
@@ -920,7 +1105,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
     ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
 ```
 
-Most likely, you'll want to use various options. For a list of what can be done, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L121-L269). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
+Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L128-L278). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
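+
+A bare-bones logger might look like this (a minimal sketch; `debug`, `warning` and `error` are the methods youtube-dl invokes on the logger):
+
+```python
+class MyLogger(object):
+    def debug(self, msg):
+        pass  # suppress verbose messages
+
+    def warning(self, msg):
+        pass  # suppress warnings
+
+    def error(self, msg):
+        print(msg)  # surface errors only
+
+ydl_opts = {'logger': MyLogger()}
+```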
 
 Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file:
 
index 7c2f49f8bb63bbe2b47efca151129a7e6b49674d..fc99c3213dddf985cfcf4fe74584cc09eeaf3175 100644 (file)
@@ -1,17 +1,38 @@
 #!/usr/bin/python3
 
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from socketserver import ThreadingMixIn
 import argparse
 import ctypes
 import functools
+import shutil
+import subprocess
 import sys
+import tempfile
 import threading
 import traceback
 import os.path
 
+sys.path.insert(0, os.path.dirname(os.path.dirname((os.path.abspath(__file__)))))
+from youtube_dl.compat import (
+    compat_input,
+    compat_http_server,
+    compat_str,
+    compat_urlparse,
+)
+
+# These are not used outside of buildserver.py thus not in compat.py
+
+try:
+    import winreg as compat_winreg
+except ImportError:  # Python 2
+    import _winreg as compat_winreg
 
-class BuildHTTPServer(ThreadingMixIn, HTTPServer):
+try:
+    import socketserver as compat_socketserver
+except ImportError:  # Python 2
+    import SocketServer as compat_socketserver
+
+
+class BuildHTTPServer(compat_socketserver.ThreadingMixIn, compat_http_server.HTTPServer):
     allow_reuse_address = True
 
 
@@ -191,7 +212,7 @@ def main(args=None):
                         action='store_const', dest='action', const='service',
                         help='Run as a Windows service')
     parser.add_argument('-b', '--bind', metavar='<host:port>',
-                        action='store', default='localhost:8142',
+                        action='store', default='0.0.0.0:8142',
                         help='Bind to host:port (default %default)')
     options = parser.parse_args(args=args)
 
@@ -216,7 +237,7 @@ def main(args=None):
     srv = BuildHTTPServer((host, port), BuildHTTPRequestHandler)
     thr = threading.Thread(target=srv.serve_forever)
     thr.start()
-    input('Press ENTER to shut down')
+    compat_input('Press ENTER to shut down')
     srv.shutdown()
     thr.join()
 
@@ -231,8 +252,6 @@ def rmtree(path):
             os.remove(fname)
     os.rmdir(path)
 
-#==============================================================================
-
 
 class BuildError(Exception):
     def __init__(self, output, code=500):
@@ -249,15 +268,25 @@ class HTTPError(BuildError):
 
 class PythonBuilder(object):
     def __init__(self, **kwargs):
-        pythonVersion = kwargs.pop('python', '2.7')
-        try:
-            key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\Python\PythonCore\%s\InstallPath' % pythonVersion)
+        python_version = kwargs.pop('python', '3.4')
+        python_path = None
+        for node in ('Wow6432Node\\', ''):
             try:
-                self.pythonPath, _ = _winreg.QueryValueEx(key, '')
-            finally:
-                _winreg.CloseKey(key)
-        except Exception:
-            raise BuildError('No such Python version: %s' % pythonVersion)
+                key = compat_winreg.OpenKey(
+                    compat_winreg.HKEY_LOCAL_MACHINE,
+                    r'SOFTWARE\%sPython\PythonCore\%s\InstallPath' % (node, python_version))
+                try:
+                    python_path, _ = compat_winreg.QueryValueEx(key, '')
+                finally:
+                    compat_winreg.CloseKey(key)
+                break
+            except Exception:
+                pass
+
+        if not python_path:
+            raise BuildError('No such Python version: %s' % python_version)
+
+        self.pythonPath = python_path
 
         super(PythonBuilder, self).__init__(**kwargs)
 
@@ -305,8 +334,10 @@ class YoutubeDLBuilder(object):
 
     def build(self):
         try:
-            subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'],
-                                    cwd=self.buildPath)
+            proc = subprocess.Popen([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'], stdin=subprocess.PIPE, cwd=self.buildPath)
+            proc.wait()
+            #subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'],
+            #                        cwd=self.buildPath)
         except subprocess.CalledProcessError as e:
             raise BuildError(e.output)
 
@@ -369,12 +400,12 @@ class Builder(PythonBuilder, GITBuilder, YoutubeDLBuilder, DownloadBuilder, Clea
     pass
 
 
-class BuildHTTPRequestHandler(BaseHTTPRequestHandler):
+class BuildHTTPRequestHandler(compat_http_server.BaseHTTPRequestHandler):
     actionDict = {'build': Builder, 'download': Builder}  # They're the same, no more caching.
 
     def do_GET(self):
-        path = urlparse.urlparse(self.path)
-        paramDict = dict([(key, value[0]) for key, value in urlparse.parse_qs(path.query).items()])
+        path = compat_urlparse.urlparse(self.path)
+        paramDict = dict([(key, value[0]) for key, value in compat_urlparse.parse_qs(path.query).items()])
         action, _, path = path.path.strip('/').partition('/')
         if path:
             path = path.split('/')
@@ -388,7 +419,7 @@ class BuildHTTPRequestHandler(BaseHTTPRequestHandler):
                         builder.close()
                 except BuildError as e:
                     self.send_response(e.code)
-                    msg = unicode(e).encode('UTF-8')
+                    msg = compat_str(e).encode('UTF-8')
                     self.send_header('Content-Type', 'text/plain; charset=UTF-8')
                     self.send_header('Content-Length', len(msg))
                     self.end_headers()
@@ -400,7 +431,5 @@ class BuildHTTPRequestHandler(BaseHTTPRequestHandler):
         else:
             self.send_response(500, 'Malformed URL')
 
-#==============================================================================
-
 if __name__ == '__main__':
     main()
diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py
new file mode 100644 (file)
index 0000000..3b8021e
--- /dev/null
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import base64
+import json
+import mimetypes
+import netrc
+import optparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.compat import (
+    compat_basestring,
+    compat_input,
+    compat_getpass,
+    compat_print,
+    compat_urllib_request,
+)
+from youtube_dl.utils import (
+    make_HTTPS_handler,
+    sanitized_Request,
+)
+
+
+class GitHubReleaser(object):
+    _API_URL = 'https://api.github.com/repos/rg3/youtube-dl/releases'
+    _UPLOADS_URL = 'https://uploads.github.com/repos/rg3/youtube-dl/releases/%s/assets?name=%s'
+    _NETRC_MACHINE = 'github.com'
+
+    def __init__(self, debuglevel=0):
+        self._init_github_account()
+        https_handler = make_HTTPS_handler({}, debuglevel=debuglevel)
+        self._opener = compat_urllib_request.build_opener(https_handler)
+
+    def _init_github_account(self):
+        try:
+            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+            if info is not None:
+                self._username = info[0]
+                self._password = info[2]
+                compat_print('Using GitHub credentials found in .netrc...')
+                return
+            else:
+                compat_print('No GitHub credentials found in .netrc')
+        except (IOError, netrc.NetrcParseError):
+            compat_print('Unable to parse .netrc')
+        self._username = compat_input(
+            'Type your GitHub username or email address and press [Return]: ')
+        self._password = compat_getpass(
+            'Type your GitHub password and press [Return]: ')
+
+    def _call(self, req):
+        if isinstance(req, compat_basestring):
+            req = sanitized_Request(req)
+        # Authorizing manually since GitHub does not respond with 401 with
+        # WWW-Authenticate header set (see
+        # https://developer.github.com/v3/#basic-authentication)
+        b64 = base64.b64encode(
+            ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii')
+        req.add_header('Authorization', 'Basic %s' % b64)
+        response = self._opener.open(req).read().decode('utf-8')
+        return json.loads(response)
+
+    def list_releases(self):
+        return self._call(self._API_URL)
+
+    def create_release(self, tag_name, name=None, body='', draft=False, prerelease=False):
+        data = {
+            'tag_name': tag_name,
+            'target_commitish': 'master',
+            'name': name,
+            'body': body,
+            'draft': draft,
+            'prerelease': prerelease,
+        }
+        req = sanitized_Request(self._API_URL, json.dumps(data).encode('utf-8'))
+        return self._call(req)
+
+    def create_asset(self, release_id, asset):
+        asset_name = os.path.basename(asset)
+        url = self._UPLOADS_URL % (release_id, asset_name)
+        # Our files are small enough to be loaded directly into memory.
+        data = open(asset, 'rb').read()
+        req = sanitized_Request(url, data)
+        mime_type, _ = mimetypes.guess_type(asset_name)
+        req.add_header('Content-Type', mime_type or 'application/octet-stream')
+        return self._call(req)
+
+
+def main():
+    parser = optparse.OptionParser(usage='%prog VERSION BUILDPATH')
+    options, args = parser.parse_args()
+    if len(args) != 2:
+        parser.error('Expected a version and a build directory')
+
+    version, build_path = args
+
+    releaser = GitHubReleaser()
+
+    new_release = releaser.create_release(version, name='youtube-dl %s' % version)
+    release_id = new_release['id']
+
+    for asset in os.listdir(build_path):
+        compat_print('Uploading %s...' % asset)
+        releaser.create_asset(release_id, os.path.join(build_path, asset))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/devscripts/install_srelay.sh b/devscripts/install_srelay.sh
new file mode 100755 (executable)
index 0000000..33ce8a3
--- /dev/null
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+mkdir -p tmp && cd tmp
+wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz
+tar zxvf srelay-0.4.8b6.tar.gz
+cd srelay-0.4.8b6
+./configure
+make
index b5a8b9190bd5c84b7c0a695cb09ae78014750cbd..9a79c2bc5a6d57f6de31be45b29807e36bd8e12f 100644 (file)
@@ -14,15 +14,17 @@ if os.path.exists(lazy_extractors_filename):
     os.remove(lazy_extractors_filename)
 
 from youtube_dl.extractor import _ALL_CLASSES
-from youtube_dl.extractor.common import InfoExtractor
+from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
 
 with open('devscripts/lazy_load_template.py', 'rt') as f:
     module_template = f.read()
 
-module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)]
+module_contents = [
+    module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
+    'class LazyLoadSearchExtractor(LazyLoadExtractor):\n    pass\n']
 
 ie_template = '''
-class {name}(LazyLoadExtractor):
+class {name}({bases}):
     _VALID_URL = {valid_url!r}
     _module = '{module}'
 '''
@@ -34,10 +36,20 @@ make_valid_template = '''
 '''
 
 
+def get_base_name(base):
+    if base is InfoExtractor:
+        return 'LazyLoadExtractor'
+    elif base is SearchInfoExtractor:
+        return 'LazyLoadSearchExtractor'
+    else:
+        return base.__name__
+
+
 def build_lazy_ie(ie, name):
     valid_url = getattr(ie, '_VALID_URL', None)
     s = ie_template.format(
         name=name,
+        bases=', '.join(map(get_base_name, ie.__bases__)),
         valid_url=valid_url,
         module=ie.__module__)
     if ie.suitable.__func__ is not InfoExtractor.suitable.__func__:
@@ -47,12 +59,35 @@ def build_lazy_ie(ie, name):
         s += make_valid_template.format(valid_url=ie._make_valid_url())
     return s
 
+# find the correct sorting and add the required base classes so that subclasses
+# can be correctly created
+classes = _ALL_CLASSES[:-1]
+ordered_cls = []
+while classes:
+    for c in classes[:]:
+        bases = set(c.__bases__) - set((object, InfoExtractor, SearchInfoExtractor))
+        stop = False
+        for b in bases:
+            if b not in classes and b not in ordered_cls:
+                if b.__name__ == 'GenericIE':
+                    exit()
+                classes.insert(0, b)
+                stop = True
+        if stop:
+            break
+        if all(b in ordered_cls for b in bases):
+            ordered_cls.append(c)
+            classes.remove(c)
+            break
+ordered_cls.append(_ALL_CLASSES[-1])
+
 names = []
-for ie in list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]:
-    name = ie.ie_key() + 'IE'
+for ie in ordered_cls:
+    name = ie.__name__
     src = build_lazy_ie(ie, name)
     module_contents.append(src)
-    names.append(name)
+    if ie in _ALL_CLASSES:
+        names.append(name)
 
 module_contents.append(
     '_ALL_CLASSES = [{0}]'.format(', '.join(names)))
index 776e6556e5b2bd683acbcf79d7bc07431be6548a..e3f6339b5a60fc0a19106e7447842d08e680dce2 100644 (file)
@@ -1,13 +1,46 @@
 from __future__ import unicode_literals
 
 import io
+import optparse
 import os.path
-import sys
 import re
 
 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 README_FILE = os.path.join(ROOT_DIR, 'README.md')
 
+PREFIX = '''%YOUTUBE-DL(1)
+
+# NAME
+
+youtube\-dl \- download videos from youtube.com or other video platforms
+
+# SYNOPSIS
+
+**youtube-dl** \[OPTIONS\] URL [URL...]
+
+'''
+
+
+def main():
+    parser = optparse.OptionParser(usage='%prog OUTFILE.md')
+    options, args = parser.parse_args()
+    if len(args) != 1:
+        parser.error('Expected an output filename')
+
+    outfile, = args
+
+    with io.open(README_FILE, encoding='utf-8') as f:
+        readme = f.read()
+
+    readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
+    readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
+    readme = PREFIX + readme
+
+    readme = filter_options(readme)
+
+    with io.open(outfile, 'w', encoding='utf-8') as outf:
+        outf.write(readme)
+
 
 def filter_options(readme):
     ret = ''
@@ -37,27 +70,5 @@ def filter_options(readme):
 
     return ret
 
-with io.open(README_FILE, encoding='utf-8') as f:
-    readme = f.read()
-
-PREFIX = '''%YOUTUBE-DL(1)
-
-# NAME
-
-youtube\-dl \- download videos from youtube.com or other video platforms
-
-# SYNOPSIS
-
-**youtube-dl** \[OPTIONS\] URL [URL...]
-
-'''
-readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
-readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
-readme = PREFIX + readme
-
-readme = filter_options(readme)
-
-if sys.version_info < (3, 0):
-    print(readme.encode('utf-8'))
-else:
-    print(readme)
+if __name__ == '__main__':
+    main()
index 8dea55dbbc6a4b577c18d9255a9ed010d95c6f2e..f8d466ba879ff7832a652ea41cfb2527b49ce954 100755 (executable)
@@ -6,7 +6,7 @@
 # * the git config user.signingkey is properly set
 
 # You will need
-# pip install coverage nose rsa
+# pip install coverage nose rsa wheel
 
 # TODO
 # release notes
 set -e
 
 skip_tests=true
-if [ "$1" = '--run-tests' ]; then
-    skip_tests=false
-    shift
-fi
+gpg_sign_commits=""
+buildserver='localhost:8142'
+
+while true
+do
+case "$1" in
+    --run-tests)
+        skip_tests=false
+        shift
+    ;;
+    --gpg-sign-commits|-S)
+        gpg_sign_commits="-S"
+        shift
+    ;;
+    --buildserver)
+        buildserver="$2"
+        shift 2
+    ;;
+    --*)
+        echo "ERROR: unknown option $1"
+        exit 1
+    ;;
+    *)
+        break
+    ;;
+esac
+done
 
 if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
 version="$1"
@@ -33,6 +56,9 @@ if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: th
 useless_files=$(find youtube_dl -type f -not -name '*.py')
 if [ ! -z "$useless_files" ]; then echo "ERROR: Non-.py files in youtube_dl: $useless_files"; exit 1; fi
 if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit 1; fi
+if ! type pandoc >/dev/null 2>/dev/null; then echo 'ERROR: pandoc is missing'; exit 1; fi
+if ! python3 -c 'import rsa' 2>/dev/null; then echo 'ERROR: python3-rsa is missing'; exit 1; fi
+if ! python3 -c 'import wheel' 2>/dev/null; then echo 'ERROR: wheel is missing'; exit 1; fi
 
 /bin/echo -e "\n### First of all, testing..."
 make clean
@@ -48,7 +74,7 @@ sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py
 /bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..."
 make README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md supportedsites
 git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py
-git commit -m "release $version"
+git commit $gpg_sign_commits -m "release $version"
 
 /bin/echo -e "\n### Now tagging, signing and pushing..."
 git tag -s -m "Release $version" "$version"
@@ -64,7 +90,7 @@ git push origin "$version"
 REV=$(git rev-parse HEAD)
 make youtube-dl youtube-dl.tar.gz
 read -p "VM running? (y/n) " -n 1
-wget "http://localhost:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe
+wget "http://$buildserver/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe
 mkdir -p "build/$version"
 mv youtube-dl youtube-dl.exe "build/$version"
 mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz"
@@ -74,15 +100,16 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz"
 (cd build/$version/ && sha256sum $RELEASE_FILES > SHA2-256SUMS)
 (cd build/$version/ && sha512sum $RELEASE_FILES > SHA2-512SUMS)
 
-/bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..."
+/bin/echo -e "\n### Signing and uploading the new binaries to GitHub..."
 for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
-scp -r "build/$version" ytdl@yt-dl.org:html/tmp/
-ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/"
+
+ROOT=$(pwd)
+python devscripts/create-github-release.py $version "$ROOT/build/$version"
+
 ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"
 
 /bin/echo -e "\n### Now switching to gh-pages..."
 git clone --branch gh-pages --single-branch . build/gh-pages
-ROOT=$(pwd)
 (
     set -e
     ORIGIN_URL=$(git config --get remote.origin.url)
@@ -94,7 +121,7 @@ ROOT=$(pwd)
     "$ROOT/devscripts/gh-pages/update-copyright.py"
     "$ROOT/devscripts/gh-pages/update-sites.py"
     git add *.html *.html.in update
-    git commit -m "release $version"
+    git commit $gpg_sign_commits -m "release $version"
     git push "$ROOT" gh-pages
     git push "$ORIGIN_URL" gh-pages
 )
diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py
new file mode 100644 (file)
index 0000000..b591d3f
--- /dev/null
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import json
+import os
+import re
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.compat import (
+    compat_print,
+    compat_urllib_request,
+)
+from youtube_dl.utils import format_bytes
+
+
+def format_size(bytes):
+    return '%s (%d bytes)' % (format_bytes(bytes), bytes)
+
+
+total_bytes = 0
+
+releases = json.loads(compat_urllib_request.urlopen(
+    'https://api.github.com/repos/rg3/youtube-dl/releases').read().decode('utf-8'))
+
+for release in releases:
+    compat_print(release['name'])
+    for asset in release['assets']:
+        asset_name = asset['name']
+        total_bytes += asset['download_count'] * asset['size']
+        if all(not re.match(p, asset_name) for p in (
+                r'^youtube-dl$',
+                r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$',
+                r'^youtube-dl\.exe$')):
+            continue
+        compat_print(
+            ' %s size: %s downloads: %d'
+            % (asset_name, format_size(asset['size']), asset['download_count']))
+
+compat_print('total downloads traffic: %s' % format_size(total_bytes))
index e12a7d1824804ce370ea18daead78e17531a417b..d2a2ef0c486dd3743dd8c0237ddd0c1ec5e47e5d 100644 (file)
@@ -6,6 +6,7 @@
  - **22tracks:genre**
  - **22tracks:track**
  - **24video**
+ - **3qsdn**: 3Q SDN
  - **3sat**
  - **4tube**
  - **56.com**
@@ -15,6 +16,8 @@
  - **9gag**
  - **abc.net.au**
  - **Abc7News**
+ - **abcnews**
+ - **abcnews:video**
  - **AcademicEarth:Course**
  - **acast**
  - **acast:channel**
@@ -25,6 +28,7 @@
  - **AdobeTVVideo**
  - **AdultSwim**
  - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network
+ - **AfreecaTV**: afreecatv.com
  - **Aftonbladet**
  - **AirMozilla**
  - **AlJazeera**
@@ -40,7 +44,6 @@
  - **appletrailers:section**
  - **archive.org**: archive.org videos
  - **ARD**
- - **ARD:mediathek**: Saarländischer Rundfunk
  - **ARD:mediathek**
  - **arte.tv**
  - **arte.tv:+7**
@@ -52,6 +55,7 @@
  - **arte.tv:future**
  - **arte.tv:info**
  - **arte.tv:magazine**
+ - **arte.tv:playlist**
  - **AtresPlayer**
  - **ATTTechChannel**
  - **AudiMedia**
@@ -69,6 +73,8 @@
  - **bbc**: BBC
  - **bbc.co.uk**: BBC iPlayer
  - **bbc.co.uk:article**: BBC articles
+ - **bbc.co.uk:iplayer:playlist**
+ - **bbc.co.uk:playlist**
  - **BeatportPro**
  - **Beeg**
  - **BehindKink**
@@ -77,6 +83,7 @@
  - **Bild**: Bild.de
  - **BiliBili**
  - **BioBioChileTV**
+ - **BIQLE**
  - **BleacherReport**
  - **BleacherReportCMS**
  - **blinkx**
  - **canalc2.tv**
  - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
  - **Canvas**
+ - **CarambaTV**
+ - **CarambaTVPage**
  - **CBC**
  - **CBCPlayer**
  - **CBS**
  - **CBSInteractive**
+ - **CBSLocal**
  - **CBSNews**: CBS News
  - **CBSNewsLiveVideo**: CBS News Live Videos
  - **CBSSports**
  - **chirbit**
  - **chirbit:profile**
  - **Cinchcast**
- - **Cinemassacre**
  - **Clipfish**
  - **cliphunter**
  - **ClipRs**
  - **Clipsyndicate**
+ - **CloserToTruth**
  - **cloudtime**: CloudTime
  - **Cloudy**
  - **Clubic**
  - **CNN**
  - **CNNArticle**
  - **CNNBlogs**
- - **CollegeHumor**
  - **CollegeRama**
  - **ComCarCoff**
  - **ComedyCentral**
  - **ComedyCentralShows**: The Daily Show / The Colbert Report
  - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED
+ - **Coub**
  - **Cracked**
  - **Crackle**
  - **Criterion**
  - **CSNNE**
  - **CSpan**: C-SPAN
  - **CtsNews**: 華視新聞
+ - **CTV**
+ - **CTVNews**
  - **culturebox.francetvinfo.fr**
  - **CultureUnplugged**
  - **CWTV**
+ - **DailyMail**
  - **dailymotion**
  - **dailymotion:playlist**
  - **dailymotion:user**
  - **defense.gouv.fr**
  - **democracynow**
  - **DHM**: Filmarchiv - Deutsches Historisches Museum
+ - **DigitallySpeaking**
  - **Digiteka**
  - **Discovery**
  - **Dotsub**
  - **Dropbox**
  - **DrTuber**
  - **DRTV**
- - **Dump**
  - **Dumpert**
  - **dvtv**: http://video.aktualne.cz/
  - **dw**
  - **exfm**: ex.fm
  - **ExpoTV**
  - **ExtremeTube**
+ - **EyedoTV**
  - **facebook**
  - **faz.net**
  - **fc2**
  - **Firstpost**
  - **FiveTV**
  - **Flickr**
+ - **Flipagram**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
+ - **Formula1**
  - **FOX**
  - **Foxgay**
  - **FoxNews**: Fox News and Fox Business Video
  - **FreeVideo**
  - **Funimation**
  - **FunnyOrDie**
+ - **Fusion**
  - **GameInformer**
  - **Gamekings**
  - **GameOne**
  - **Gamersyde**
  - **GameSpot**
  - **GameStar**
- - **Gametrailers**
  - **Gazeta**
  - **GDCVault**
  - **generic**: Generic downloader that works on some sites
  - **Globo**
  - **GloboArticle**
  - **GodTube**
+ - **GodTV**
  - **GoldenMoustache**
  - **Golem**
  - **GoogleDrive**
  - **Helsinki**: helsinki.fi
  - **HentaiStigma**
  - **HistoricFilms**
+ - **history:topic**: History.com Topic
  - **hitbox**
  - **hitbox:live**
  - **HornBunny**
  - **HotStar**
  - **Howcast**
  - **HowStuffWorks**
+ - **HRTi**
+ - **HRTiPlaylist**
  - **HuffPost**: Huffington Post
  - **Hypem**
  - **Iconosquare**
  - **jpopsuki.tv**
  - **JWPlatform**
  - **Kaltura**
+ - **Kamcord**
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
  - **kuwo:mv**: 酷我音乐 - MV
  - **kuwo:singer**: 酷我音乐 - 歌手
  - **kuwo:song**: 酷我音乐
- - **la7.tv**
+ - **la7.it**
  - **Laola1Tv**
  - **Le**: 乐视网
+ - **Learnr**
  - **Lecture2Go**
  - **Lemonde**
  - **LePlaylist**
  - **LetvCloud**: 乐视云
  - **Libsyn**
+ - **life**: Life.ru
  - **life:embed**
- - **lifenews**: LIFE | NEWS
  - **limelight**
  - **limelight:channel**
  - **limelight:channel_list**
+ - **LiTV**
  - **LiveLeak**
  - **livestream**
  - **livestream:original**
  - **LnkGo**
+ - **loc**: Library of Congress
+ - **LocalNews8**
  - **LoveHomePorn**
  - **lrt.lt**
  - **lynda**: lynda.com videos
  - **mailru**: Видео@Mail.Ru
  - **MakersChannel**
  - **MakerTV**
- - **Malemotion**
  - **MatchTV**
  - **MDR**: MDR.DE and KiKA
  - **media.ccc.de**
+ - **META**
  - **metacafe**
  - **Metacritic**
  - **Mgoon**
+ - **MGTV**: 芒果TV
  - **Minhateca**
  - **MinistryGrid**
  - **Minoto**
  - **MovieFap**
  - **Moviezine**
  - **MPORA**
- - **MSNBC**
+ - **MSN**
  - **MTV**
  - **mtv.de**
  - **mtviggy.com**
  - **mtvservices:embedded**
  - **MuenchenTV**: münchen.tv
  - **MusicPlayOn**
- - **muzu.tv**
+ - **mva**: Microsoft Virtual Academy videos
+ - **mva:course**: Microsoft Virtual Academy courses
  - **Mwave**
+ - **MwaveMeetGreet**
  - **MySpace**
  - **MySpace:album**
  - **MySpass**
  - **nfl.com**
  - **nhl.com**
  - **nhl.com:news**: NHL news
- - **nhl.com:videocenter**: NHL videocenter category
+ - **nhl.com:videocenter**
+ - **nhl.com:videocenter:category**: NHL videocenter category
  - **nick.com**
+ - **nick.de**
  - **niconico**: ニコニコ動画
  - **NiconicoPlaylist**
+ - **NineCNineMedia**
  - **njoy**: N-JOY
  - **njoy:embed**
  - **Noco**
  - **Odnoklassniki**
  - **OktoberfestTV**
  - **on.aol.com**
+ - **onet.tv**
+ - **onet.tv:channel**
  - **OnionStudios**
  - **Ooyala**
  - **OoyalaExternal**
  - **Patreon**
  - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET  (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
  - **pcmag**
- - **Periscope**: Periscope
+ - **People**
+ - **periscope**: Periscope
+ - **periscope:user**: Periscope user videos
  - **PhilharmonieDeParis**: Philharmonie de Paris
  - **phoenix.de**
  - **Photobucket**
  - **Pinkbike**
  - **Pladform**
- - **PlanetaPlay**
  - **play.fm**
  - **played.to**
  - **PlaysTV**
  - **plus.google**: Google Plus
  - **pluzz.francetv.fr**
  - **podomatic**
+ - **PolskieRadio**
  - **PornHd**
- - **PornHub**
+ - **PornHub**: PornHub and Thumbzilla
  - **PornHubPlaylist**
  - **PornHubUserVideos**
  - **Pornotube**
  - **qqmusic:playlist**: QQ音乐 - 歌单
  - **qqmusic:singer**: QQ音乐 - 歌手
  - **qqmusic:toplist**: QQ音乐 - 排行榜
- - **QuickVid**
  - **R7**
+ - **R7Article**
  - **radio.de**
  - **radiobremen**
+ - **radiocanada**
+ - **RadioCanadaAudioVideo**
  - **radiofrance**
  - **RadioJavan**
  - **Rai**
  - **RedTube**
  - **RegioTV**
  - **Restudy**
+ - **Reuters**
  - **ReverbNation**
- - **Revision3**
+ - **revision**
+ - **revision3:embed**
  - **RICE**
  - **RingTV**
+ - **RockstarGames**
  - **RottenTomatoes**
  - **Roxwel**
  - **RTBF**
  - **ScreencastOMatic**
  - **ScreenJunkies**
  - **ScreenwaveMedia**
+ - **Seeker**
  - **SenateISVP**
+ - **SendtoNews**
  - **ServingSys**
  - **Sexu**
- - **SexyKarma**: Sexy Karma and Watch Indian Porn
  - **Shahid**
  - **Shared**: shared.sx and vivo.sx
  - **ShareSix**
  - **Sina**
+ - **SixPlay**
+ - **skynewsarabia:article**
  - **skynewsarabia:video**
- - **skynewsarabia:video**
+ - **SkySports**
  - **Slideshare**
  - **Slutload**
  - **smotri**: Smotri.com
  - **smotri:broadcast**: Smotri.com broadcasts
  - **smotri:community**: Smotri.com community videos
  - **smotri:user**: Smotri.com user videos
- - **SnagFilms**
- - **SnagFilmsEmbed**
  - **Snotr**
  - **Sohu**
  - **soundcloud**
  - **SportBoxEmbed**
  - **SportDeutschland**
  - **Sportschau**
+ - **sr:mediathek**: Saarländischer Rundfunk
  - **SRGSSR**
  - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites
  - **SSA**
  - **Syfy**
  - **SztvHu**
  - **Tagesschau**
+ - **tagesschau:player**
  - **Tapely**
  - **Tass**
  - **TDSLifeway**
  - **Telegraaf**
  - **TeleMB**
  - **TeleTask**
+ - **Telewebion**
  - **TF1**
  - **TheIntercept**
  - **ThePlatform**
  - **TVCArticle**
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvland.com**
- - **tvp.pl**
- - **tvp.pl:Series**
+ - **tvp**: Telewizja Polska
+ - **tvp:series**
  - **TVPlay**: TV3Play and related services
  - **Tweakers**
- - **twitch:bookmarks**
  - **twitch:chapter**
+ - **twitch:clips**
  - **twitch:past_broadcasts**
  - **twitch:profile**
  - **twitch:stream**
  - **UDNEmbed**: 聯合影音
  - **Unistra**
  - **Urort**: NRK P3 Urørt
+ - **URPlay**
  - **USAToday**
  - **ustream**
  - **ustream:channel**
- - **Ustudio**
+ - **ustudio**
+ - **ustudio:embed**
  - **Varzesh3**
  - **Vbox7**
  - **VeeHD**
  - **Vessel**
  - **Vesti**: Вести.Ru
  - **Vevo**
+ - **VevoPlaylist**
  - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
  - **vh1.com**
  - **Vice**
  - **ViceShow**
+ - **Vidbit**
  - **Viddler**
  - **video.google:search**: Google Video search
  - **video.mit.edu**
  - **VideoPremium**
  - **VideoTt**: video.tt - Your True Tube (Currently broken)
  - **videoweed**: VideoWeed
+ - **Vidio**
  - **vidme**
  - **vidme:user**
  - **vidme:user:likes**
  - **Vidzi**
  - **vier**
  - **vier:videos**
+ - **ViewLift**
+ - **ViewLiftEmbed**
  - **Viewster**
  - **Viidea**
  - **viki**
  - **VRT**
  - **vube**: Vube.com
  - **VuClip**
- - **vulture.com**
  - **Walla**
- - **WashingtonPost**
+ - **washingtonpost**
+ - **washingtonpost:article**
  - **wat.tv**
+ - **WatchIndianPorn**: Watch Indian Porn
  - **WDR**
  - **wdr:mobile**
- - **WDRMaus**: Sendung mit der Maus
  - **WebOfStories**
  - **WebOfStoriesPlaylist**
- - **Weibo**
  - **WeiqiTV**: WQTV
  - **wholecloud**: WholeCloud
  - **Wimp**
  - **WNL**
  - **WorldStarHipHop**
  - **wrzuta.pl**
+ - **wrzuta.pl:playlist**
  - **WSJ**: Wall Street Journal
  - **XBef**
  - **XboxClips**
- - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me
+ - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE
  - **XHamster**
  - **XHamsterEmbed**
+ - **xiami:album**: 虾米音乐 - 专辑
+ - **xiami:artist**: 虾米音乐 - 歌手
+ - **xiami:collection**: 虾米音乐 - 精选集
+ - **xiami:song**: 虾米音乐
  - **XMinus**
  - **XNXX**
  - **Xstream**
  - **Ynet**
  - **YouJizz**
  - **youku**: 优酷
+ - **youku:show**
  - **YouPorn**
  - **YourUpload**
  - **youtube**: YouTube.com
  - **youtube:search**: YouTube.com searches
  - **youtube:search:date**: YouTube.com searches, newest videos first
  - **youtube:search_url**: YouTube.com search URLs
+ - **youtube:shared**
  - **youtube:show**: YouTube.com (multi-season) shows
  - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
index 9444d403d542a0d3066d9d633532e4daf11726e9..508b27f3707898d07d303cd1ce44b7e4d54b152f 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -21,25 +21,37 @@ try:
     import py2exe
 except ImportError:
     if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe':
-        print("Cannot import py2exe", file=sys.stderr)
+        print('Cannot import py2exe', file=sys.stderr)
         exit(1)
 
 py2exe_options = {
-    "bundle_files": 1,
-    "compressed": 1,
-    "optimize": 2,
-    "dist_dir": '.',
-    "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'],
+    'bundle_files': 1,
+    'compressed': 1,
+    'optimize': 2,
+    'dist_dir': '.',
+    'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'],
 }
 
+# Get the version from youtube_dl/version.py without importing the package
+exec(compile(open('youtube_dl/version.py').read(),
+             'youtube_dl/version.py', 'exec'))
+
+DESCRIPTION = 'YouTube video downloader'
+LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites'
+
 py2exe_console = [{
-    "script": "./youtube_dl/__main__.py",
-    "dest_base": "youtube-dl",
+    'script': './youtube_dl/__main__.py',
+    'dest_base': 'youtube-dl',
+    'version': __version__,
+    'description': DESCRIPTION,
+    'comments': LONG_DESCRIPTION,
+    'product_name': 'youtube-dl',
+    'product_version': __version__,
 }]
 
 py2exe_params = {
     'console': py2exe_console,
-    'options': {"py2exe": py2exe_options},
+    'options': {'py2exe': py2exe_options},
     'zipfile': None
 }
 
@@ -72,7 +84,7 @@ else:
         params['scripts'] = ['bin/youtube-dl']
 
 class build_lazy_extractors(Command):
-    description = "Build the extractor lazy loading module"
+    description = 'Build the extractor lazy loading module'
     user_options = []
 
     def initialize_options(self):
@@ -87,16 +99,11 @@ class build_lazy_extractors(Command):
             dry_run=self.dry_run,
         )
 
-# Get the version from youtube_dl/version.py without importing the package
-exec(compile(open('youtube_dl/version.py').read(),
-             'youtube_dl/version.py', 'exec'))
-
 setup(
     name='youtube_dl',
     version=__version__,
-    description='YouTube video downloader',
-    long_description='Small command-line program to download videos from'
-    ' YouTube.com and other video sites.',
+    description=DESCRIPTION,
+    long_description=LONG_DESCRIPTION,
     url='https://github.com/rg3/youtube-dl',
     author='Ricardo Garcia',
     author_email='ytdl@yt-dl.org',
@@ -112,16 +119,17 @@ setup(
     # test_requires = ['nosetest'],
 
     classifiers=[
-        "Topic :: Multimedia :: Video",
-        "Development Status :: 5 - Production/Stable",
-        "Environment :: Console",
-        "License :: Public Domain",
-        "Programming Language :: Python :: 2.6",
-        "Programming Language :: Python :: 2.7",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.2",
-        "Programming Language :: Python :: 3.3",
-        "Programming Language :: Python :: 3.4",
+        'Topic :: Multimedia :: Video',
+        'Development Status :: 5 - Production/Stable',
+        'Environment :: Console',
+        'License :: Public Domain',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.2',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
     ],
 
     cmdclass={'build_lazy_extractors': build_lazy_extractors},
index b8e22c5cb42f2e14465e812ed624aaa5e102ff5c..dfee217a9b8acb64e426c3ce8fc5c11a9c5a0121 100644 (file)
@@ -24,8 +24,13 @@ from youtube_dl.utils import (
 def get_params(override=None):
     PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                    "parameters.json")
+    LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                         "local_parameters.json")
     with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
         parameters = json.load(pf)
+    if os.path.exists(LOCAL_PARAMETERS_FILE):
+        with io.open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf:
+            parameters.update(json.load(pf))
     if override:
         parameters.update(override)
     return parameters
index 6404ac89f55df282e9525f6ae1a8e62f7344dd40..88e8ff904e26576125910cd2ecd2c3e5662d7b17 100644 (file)
@@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from test.helper import FakeYDL
 from youtube_dl.extractor.common import InfoExtractor
 from youtube_dl.extractor import YoutubeIE, get_info_extractor
-from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError
+from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
 
 
 class TestIE(InfoExtractor):
@@ -66,6 +66,11 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._html_search_meta('d', html), '4')
         self.assertEqual(ie._html_search_meta('e', html), '5')
         self.assertEqual(ie._html_search_meta('f', html), '6')
+        self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1')
+        self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3')
+        self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3')
+        self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
+        self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
index f5af184e6e0a79ccc11a9a66c2a9f19434087108..1f6079c2934de5386b4cb0a8a580c5f188824083 100644 (file)
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import unittest
+import collections
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
@@ -130,6 +131,15 @@ class TestAllURLsMatching(unittest.TestCase):
             'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
             ['Yahoo'])
 
+    def test_no_duplicated_ie_names(self):
+        name_accu = collections.defaultdict(list)
+        for ie in self.ies:
+            name_accu[ie.IE_NAME.lower()].append(type(ie).__name__)
+        for (ie_name, ie_list) in name_accu.items():
+            self.assertEqual(
+                len(ie_list), 1,
+                'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list)))
+
 
 if __name__ == '__main__':
     unittest.main()
index 618668210f62191da7f899a2a586c699b512c129..b574249489a3ded4cf5dcd66e49c123829c2331e 100644 (file)
@@ -10,13 +10,14 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
-from youtube_dl.utils import get_filesystem_encoding
 from youtube_dl.compat import (
     compat_getenv,
+    compat_setenv,
     compat_etree_fromstring,
     compat_expanduser,
     compat_shlex_split,
     compat_str,
+    compat_struct_unpack,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlencode,
@@ -26,19 +27,22 @@ from youtube_dl.compat import (
 class TestCompat(unittest.TestCase):
     def test_compat_getenv(self):
         test_str = 'тест'
-        os.environ['YOUTUBE-DL-TEST'] = (
-            test_str if sys.version_info >= (3, 0)
-            else test_str.encode(get_filesystem_encoding()))
+        compat_setenv('YOUTUBE-DL-TEST', test_str)
         self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str)
 
+    def test_compat_setenv(self):
+        test_var = 'YOUTUBE-DL-TEST'
+        test_str = 'тест'
+        compat_setenv(test_var, test_str)
+        compat_getenv(test_var)
+        self.assertEqual(compat_getenv(test_var), test_str)
+
     def test_compat_expanduser(self):
         old_home = os.environ.get('HOME')
         test_str = 'C:\Documents and Settings\тест\Application Data'
-        os.environ['HOME'] = (
-            test_str if sys.version_info >= (3, 0)
-            else test_str.encode(get_filesystem_encoding()))
+        compat_setenv('HOME', test_str)
         self.assertEqual(compat_expanduser('~'), test_str)
-        os.environ['HOME'] = old_home
+        compat_setenv('HOME', old_home or '')
 
     def test_all_present(self):
         import youtube_dl.compat
@@ -83,6 +87,8 @@ class TestCompat(unittest.TestCase):
 
     def test_compat_shlex_split(self):
         self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
+        self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag'])
+        self.assertEqual(compat_shlex_split('-val 中文'), ['-val', '中文'])
 
     def test_compat_etree_fromstring(self):
         xml = '''
@@ -99,5 +105,15 @@ class TestCompat(unittest.TestCase):
         self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
         self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
 
+    def test_compat_etree_fromstring_doctype(self):
+        xml = '''<?xml version="1.0"?>
+<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">
+<smil xmlns="http://www.w3.org/2001/SMIL20/Language"></smil>'''
+        compat_etree_fromstring(xml)
+
+    def test_struct_unpack(self):
+        self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,))
+
+
 if __name__ == '__main__':
     unittest.main()
index 15e0ad369d57966bef222bf35c422ad9bdb4e755..fdc68ccb42c85410788ecb7bcb1eafd802b3a794 100644 (file)
@@ -16,6 +16,15 @@ import threading
 TEST_DIR = os.path.dirname(os.path.abspath(__file__))
 
 
+def http_server_port(httpd):
+    if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket):
+        # In Jython SSLSocket is not a subclass of socket.socket
+        sock = httpd.socket.sock
+    else:
+        sock = httpd.socket
+    return sock.getsockname()[1]
+
+
 class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
     def log_message(self, format, *args):
         pass
@@ -31,6 +40,22 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
             self.send_header('Content-Type', 'video/mp4')
             self.end_headers()
             self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]')
+        elif self.path == '/302':
+            if sys.version_info[0] == 3:
+                # XXX: Python 3 http server does not allow non-ASCII header values
+                self.send_response(404)
+                self.end_headers()
+                return
+
+            new_url = 'http://localhost:%d/中文.html' % http_server_port(self.server)
+            self.send_response(302)
+            self.send_header(b'Location', new_url.encode('utf-8'))
+            self.end_headers()
+        elif self.path == '/%E4%B8%AD%E6%96%87.html':
+            self.send_response(200)
+            self.send_header('Content-Type', 'text/html; charset=utf-8')
+            self.end_headers()
+            self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
         else:
             assert False
 
@@ -47,18 +72,32 @@ class FakeLogger(object):
 
 
 class TestHTTP(unittest.TestCase):
+    def setUp(self):
+        self.httpd = compat_http_server.HTTPServer(
+            ('localhost', 0), HTTPTestRequestHandler)
+        self.port = http_server_port(self.httpd)
+        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
+        self.server_thread.daemon = True
+        self.server_thread.start()
+
+    def test_unicode_path_redirection(self):
+        # XXX: Python 3 http server does not allow non-ASCII header values
+        if sys.version_info[0] == 3:
+            return
+
+        ydl = YoutubeDL({'logger': FakeLogger()})
+        r = ydl.extract_info('http://localhost:%d/302' % self.port)
+        self.assertEqual(r['url'], 'http://localhost:%d/vid.mp4' % self.port)
+
+
+class TestHTTPS(unittest.TestCase):
     def setUp(self):
         certfn = os.path.join(TEST_DIR, 'testcert.pem')
         self.httpd = compat_http_server.HTTPServer(
             ('localhost', 0), HTTPTestRequestHandler)
         self.httpd.socket = ssl.wrap_socket(
             self.httpd.socket, certfile=certfn, server_side=True)
-        if os.name == 'java':
-            # In Jython SSLSocket is not a subclass of socket.socket
-            sock = self.httpd.socket.sock
-        else:
-            sock = self.httpd.socket
-        self.port = sock.getsockname()[1]
+        self.port = http_server_port(self.httpd)
         self.server_thread = threading.Thread(target=self.httpd.serve_forever)
         self.server_thread.daemon = True
         self.server_thread.start()
@@ -94,32 +133,32 @@ class TestProxy(unittest.TestCase):
     def setUp(self):
         self.proxy = compat_http_server.HTTPServer(
             ('localhost', 0), _build_proxy_handler('normal'))
-        self.port = self.proxy.socket.getsockname()[1]
+        self.port = http_server_port(self.proxy)
         self.proxy_thread = threading.Thread(target=self.proxy.serve_forever)
         self.proxy_thread.daemon = True
         self.proxy_thread.start()
 
-        self.cn_proxy = compat_http_server.HTTPServer(
-            ('localhost', 0), _build_proxy_handler('cn'))
-        self.cn_port = self.cn_proxy.socket.getsockname()[1]
-        self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever)
-        self.cn_proxy_thread.daemon = True
-        self.cn_proxy_thread.start()
+        self.geo_proxy = compat_http_server.HTTPServer(
+            ('localhost', 0), _build_proxy_handler('geo'))
+        self.geo_port = http_server_port(self.geo_proxy)
+        self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever)
+        self.geo_proxy_thread.daemon = True
+        self.geo_proxy_thread.start()
 
     def test_proxy(self):
-        cn_proxy = 'localhost:{0}'.format(self.cn_port)
+        geo_proxy = 'localhost:{0}'.format(self.geo_port)
         ydl = YoutubeDL({
             'proxy': 'localhost:{0}'.format(self.port),
-            'cn_verification_proxy': cn_proxy,
+            'geo_verification_proxy': geo_proxy,
         })
         url = 'http://foo.com/bar'
         response = ydl.urlopen(url).read().decode('utf-8')
         self.assertEqual(response, 'normal: {0}'.format(url))
 
         req = compat_urllib_request.Request(url)
-        req.add_header('Ytdl-request-proxy', cn_proxy)
+        req.add_header('Ytdl-request-proxy', geo_proxy)
         response = ydl.urlopen(req).read().decode('utf-8')
-        self.assertEqual(response, 'cn: {0}'.format(url))
+        self.assertEqual(response, 'geo: {0}'.format(url))
 
     def test_proxy_with_idn(self):
         ydl = YoutubeDL({
diff --git a/test/test_socks.py b/test/test_socks.py
new file mode 100644 (file)
index 0000000..1e68eb0
--- /dev/null
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import random
+import subprocess
+
+from test.helper import (
+    FakeYDL,
+    get_params,
+)
+from youtube_dl.compat import (
+    compat_str,
+    compat_urllib_request,
+)
+
+
+class TestMultipleSocks(unittest.TestCase):
+    @staticmethod
+    def _check_params(attrs):
+        params = get_params()
+        for attr in attrs:
+            if attr not in params:
+                print('Missing %s. Skipping.' % attr)
+                return
+        return params
+
+    def test_proxy_http(self):
+        params = self._check_params(['primary_proxy', 'primary_server_ip'])
+        if params is None:
+            return
+        ydl = FakeYDL({
+            'proxy': params['primary_proxy']
+        })
+        self.assertEqual(
+            ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8'),
+            params['primary_server_ip'])
+
+    def test_proxy_https(self):
+        params = self._check_params(['primary_proxy', 'primary_server_ip'])
+        if params is None:
+            return
+        ydl = FakeYDL({
+            'proxy': params['primary_proxy']
+        })
+        self.assertEqual(
+            ydl.urlopen('https://yt-dl.org/ip').read().decode('utf-8'),
+            params['primary_server_ip'])
+
+    def test_secondary_proxy_http(self):
+        params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
+        if params is None:
+            return
+        ydl = FakeYDL()
+        req = compat_urllib_request.Request('http://yt-dl.org/ip')
+        req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
+        self.assertEqual(
+            ydl.urlopen(req).read().decode('utf-8'),
+            params['secondary_server_ip'])
+
+    def test_secondary_proxy_https(self):
+        params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
+        if params is None:
+            return
+        ydl = FakeYDL()
+        req = compat_urllib_request.Request('https://yt-dl.org/ip')
+        req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
+        self.assertEqual(
+            ydl.urlopen(req).read().decode('utf-8'),
+            params['secondary_server_ip'])
+
+
+class TestSocks(unittest.TestCase):
+    _SKIP_SOCKS_TEST = True
+
+    def setUp(self):
+        if self._SKIP_SOCKS_TEST:
+            return
+
+        self.port = random.randint(20000, 30000)
+        self.server_process = subprocess.Popen([
+            'srelay', '-f', '-i', '127.0.0.1:%d' % self.port],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    def tearDown(self):
+        if self._SKIP_SOCKS_TEST:
+            return
+
+        self.server_process.terminate()
+        self.server_process.communicate()
+
+    def _get_ip(self, protocol):
+        if self._SKIP_SOCKS_TEST:
+            return '127.0.0.1'
+
+        ydl = FakeYDL({
+            'proxy': '%s://127.0.0.1:%d' % (protocol, self.port),
+        })
+        return ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8')
+
+    def test_socks4(self):
+        self.assertTrue(isinstance(self._get_ip('socks4'), compat_str))
+
+    def test_socks4a(self):
+        self.assertTrue(isinstance(self._get_ip('socks4a'), compat_str))
+
+    def test_socks5(self):
+        self.assertTrue(isinstance(self._get_ip('socks5'), compat_str))
+
+
+if __name__ == '__main__':
+    unittest.main()
index e16a6761b7e9a70589c6da7b48c9f54e2c03e734..afd273a6533b68915c3b9cbcd547591882f9b37a 100644 (file)
@@ -33,6 +33,7 @@ from youtube_dl.utils import (
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
+    get_element_by_class,
     InAdvancePagedList,
     intlist_to_bytes,
     is_html,
@@ -50,20 +51,23 @@ from youtube_dl.utils import (
     sanitize_path,
     prepend_extension,
     replace_extension,
+    remove_start,
+    remove_end,
     remove_quotes,
     shell_quote,
     smuggle_url,
     str_to_int,
     strip_jsonp,
-    struct_unpack,
     timeconvert,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     unsmuggle_url,
     uppercase_escape,
     lowercase_escape,
     url_basename,
     urlencode_postdata,
+    urshift,
     update_url_query,
     version_tuple,
     xpath_with_ns,
@@ -139,8 +143,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True))
         self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True))
 
-        tests = 'a\xe4b\u4e2d\u56fd\u7684c'
-        self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c')
+        tests = 'aäb\u4e2d\u56fd\u7684c'
+        self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c')
         self.assertTrue(sanitize_filename('\xf6', restricted=True) != '')  # No empty filename
 
         forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#'
@@ -155,6 +159,10 @@ class TestUtil(unittest.TestCase):
         self.assertTrue(sanitize_filename('-', restricted=True) != '')
         self.assertTrue(sanitize_filename(':', restricted=True) != '')
 
+        self.assertEqual(sanitize_filename(
+            'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True),
+            'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYPssaaaaaaaeceeeeiiiionooooooooeuuuuuypy')
+
     def test_sanitize_ids(self):
         self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
         self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
@@ -212,6 +220,16 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp')
         self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp')
 
+    def test_remove_start(self):
+        self.assertEqual(remove_start(None, 'A - '), None)
+        self.assertEqual(remove_start('A - B', 'A - '), 'B')
+        self.assertEqual(remove_start('B - A', 'A - '), 'B - A')
+
+    def test_remove_end(self):
+        self.assertEqual(remove_end(None, ' - B'), None)
+        self.assertEqual(remove_end('A - B', ' - B'), 'A')
+        self.assertEqual(remove_end('B - A', ' - B'), 'B - A')
+
     def test_remove_quotes(self):
         self.assertEqual(remove_quotes(None), None)
         self.assertEqual(remove_quotes('"'), '"')
@@ -234,6 +252,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(unescapeHTML('&eacute;'), 'é')
         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+        # HTML5 entities
+        self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
 
     def test_date_from_str(self):
         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day'))
@@ -266,8 +286,28 @@ class TestUtil(unittest.TestCase):
             '20150202')
         self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214')
         self.assertEqual(unified_strdate('25-09-2014'), '20140925')
+        self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227')
         self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)
 
+    def test_unified_timestamps(self):
+        self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600)
+        self.assertEqual(unified_timestamp('8/7/2009'), 1247011200)
+        self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200)
+        self.assertEqual(unified_timestamp('2012/10/11 01:56:38 +0000'), 1349920598)
+        self.assertEqual(unified_timestamp('1968 12 10'), -33436800)
+        self.assertEqual(unified_timestamp('1968-12-10'), -33436800)
+        self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200)
+        self.assertEqual(
+            unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False),
+            1417001400)
+        self.assertEqual(
+            unified_timestamp('2/2/2015 6:47:40 PM', day_first=False),
+            1422902860)
+        self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900)
+        self.assertEqual(unified_timestamp('25-09-2014'), 1411603200)
+        self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200)
+        self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
+
     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
         self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None)
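
Review note: the expected values in test_unified_timestamps are plain UTC epoch seconds; negative results are simply dates before 1970. A quick sanity check for the first case (not part of the patch):

    import calendar
    import datetime

    # 'December 21, 2010' parsed as midnight UTC:
    dt = datetime.datetime(2010, 12, 21)
    assert calendar.timegm(dt.timetuple()) == 1292889600
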
@@ -366,6 +406,12 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(res_url, url)
         self.assertEqual(res_data, None)
 
+        smug_url = smuggle_url(url, {'a': 'b'})
+        smug_smug_url = smuggle_url(smug_url, {'c': 'd'})
+        res_url, res_data = unsmuggle_url(smug_smug_url)
+        self.assertEqual(res_url, url)
+        self.assertEqual(res_data, {'a': 'b', 'c': 'd'})
+
     def test_shell_quote(self):
         args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
         self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""")
@@ -453,9 +499,6 @@ class TestUtil(unittest.TestCase):
         testPL(5, 2, (2, 99), [2, 3, 4])
         testPL(5, 2, (20, 99), [])
 
-    def test_struct_unpack(self):
-        self.assertEqual(struct_unpack('!B', b'\x00'), (0,))
-
     def test_read_batch_urls(self):
         f = io.StringIO('''\xef\xbb\xbf foo
             bar\r
@@ -617,6 +660,18 @@ class TestUtil(unittest.TestCase):
         json_code = js_to_json(inp)
         self.assertEqual(json.loads(json_code), json.loads(inp))
 
+        inp = '''{
+            0:{src:'skipped', type: 'application/dash+xml'},
+            1:{src:'skipped', type: 'application/vnd.apple.mpegURL'},
+        }'''
+        self.assertEqual(js_to_json(inp), '''{
+            "0":{"src":"skipped", "type": "application/dash+xml"},
+            "1":{"src":"skipped", "type": "application/vnd.apple.mpegURL"}
+        }''')
+
+        inp = '''{"foo":101}'''
+        self.assertEqual(js_to_json(inp), '''{"foo":101}''')
+
     def test_js_to_json_edgecases(self):
         on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
         self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
@@ -640,6 +695,27 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{"abc": "def",}')
         self.assertEqual(json.loads(on), {'abc': 'def'})
 
+        on = js_to_json('{ 0: /* " \n */ ",]" , }')
+        self.assertEqual(json.loads(on), {'0': ',]'})
+
+        on = js_to_json(r'["<p>x<\/p>"]')
+        self.assertEqual(json.loads(on), ['<p>x</p>'])
+
+        on = js_to_json(r'["\xaa"]')
+        self.assertEqual(json.loads(on), ['\u00aa'])
+
+        on = js_to_json("['a\\\nb']")
+        self.assertEqual(json.loads(on), ['ab'])
+
+        on = js_to_json('{0xff:0xff}')
+        self.assertEqual(json.loads(on), {'255': 255})
+
+        on = js_to_json('{077:077}')
+        self.assertEqual(json.loads(on), {'63': 63})
+
+        on = js_to_json('{42:42}')
+        self.assertEqual(json.loads(on), {'42': 42})
+
     def test_extract_attributes(self):
         self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
         self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
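
Review note: the new js_to_json cases cover JS numeric literals used as object keys and values; hex (0xff) and octal (077) literals are rewritten to their decimal JSON equivalents and bare keys are quoted. Mirroring the tests above:

    import json
    from youtube_dl.utils import js_to_json

    assert json.loads(js_to_json('{0xff:0xff}')) == {'255': 255}
    assert json.loads(js_to_json('{077:077}')) == {'63': 63}
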
@@ -912,5 +988,17 @@ The first line
         self.assertRaises(ValueError, encode_base_n, 0, 70)
         self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table)
 
+    def test_urshift(self):
+        self.assertEqual(urshift(3, 1), 1)
+        self.assertEqual(urshift(-3, 1), 2147483646)
+
+    def test_get_element_by_class(self):
+        html = '''
+            <span class="foo bar">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('no-such-class', html), None)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tox.ini b/tox.ini
index 2d71340050bf8f8a971acb3931621f62ded02176..9c4e4a3d1eab285d8def7fce06e7d5ceb108952e 100644 (file)
--- a/tox.ini
+++ b/tox.ini
@@ -9,5 +9,6 @@ passenv = HOME
 defaultargs = test --exclude test_download.py --exclude test_age_restriction.py
     --exclude test_subtitles.py --exclude test_write_annotations.py
     --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py
+    --exclude test_socks.py
 commands = nosetests --verbose {posargs:{[testenv]defaultargs}}  # --with-coverage --cover-package=youtube_dl --cover-html
                                                # test.test_download:TestDownload.test_NowVideo
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 0554333629b829a2a6cb807546a643713cbd0ad5..ba72ec6f3d189b85737a5188074070f835b16f71 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -64,6 +64,7 @@ from .utils import (
     PostProcessingError,
     preferredencoding,
     prepend_extension,
+    register_socks_protocols,
     render_table,
     replace_extension,
     SameFileError,
@@ -195,8 +196,8 @@ class YoutubeDL(object):
     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                        At the moment, this is only supported by YouTube.
     proxy:             URL of the proxy server to use
-    cn_verification_proxy:  URL of the proxy to use for IP address verification
-                       on Chinese sites. (Experimental)
+    geo_verification_proxy:  URL of the proxy to use for IP address verification
+                       on geo-restricted sites. (Experimental)
     socket_timeout:    Time to wait for unresponsive hosts, in seconds
     bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fridibi
@@ -303,6 +304,11 @@ class YoutubeDL(object):
         self.params.update(params)
         self.cache = Cache(self)
 
+        if self.params.get('cn_verification_proxy') is not None:
+            self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.')
+            if self.params.get('geo_verification_proxy') is None:
+                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
         if params.get('bidi_workaround', False):
             try:
                 import pty
@@ -325,7 +331,7 @@ class YoutubeDL(object):
                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
                 self._output_channel = os.fdopen(master, 'rb')
             except OSError as ose:
-                if ose.errno == 2:
+                if ose.errno == errno.ENOENT:
                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
                 else:
                     raise
@@ -361,6 +367,8 @@ class YoutubeDL(object):
         for ph in self.params.get('progress_hooks', []):
             self.add_progress_hook(ph)
 
+        register_socks_protocols()
+
     def warn_if_short_id(self, argv):
         # short YouTube ID starting with dash?
         idxs = [
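
Review note: register_socks_protocols() lives in youtube_dl/utils.py and is not shown in this diff. The call is needed because older urlsplit() implementations only split the network location for schemes registered in urlparse.uses_netloc, so 'socks5://127.0.0.1:1080' could otherwise parse with an empty netloc. A sketch of the idea (assumed shape, not the literal helper):

    try:
        import urllib.parse as urlparse  # Python 3
    except ImportError:
        import urlparse  # Python 2

    # Teach urlparse that socks URLs carry a host:port netloc.
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urlparse.uses_netloc:
            urlparse.uses_netloc.append(scheme)
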
@@ -580,7 +588,7 @@ class YoutubeDL(object):
                 is_id=(k == 'id'))
             template_dict = dict((k, sanitize(k, v))
                                  for k, v in template_dict.items()
-                                 if v is not None)
+                                 if v is not None and not isinstance(v, (list, tuple, dict)))
             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 
             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
@@ -717,6 +725,7 @@ class YoutubeDL(object):
         result_type = ie_result.get('_type', 'video')
 
         if result_type in ('url', 'url_transparent'):
+            ie_result['url'] = sanitize_url(ie_result['url'])
             extract_flat = self.params.get('extract_flat', False)
             if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                     extract_flat is True):
@@ -1219,6 +1228,10 @@ class YoutubeDL(object):
         if 'title' not in info_dict:
             raise ExtractorError('Missing "title" field in extractor result')
 
+        if not isinstance(info_dict['id'], compat_str):
+            self.report_warning('"id" field is not a string - forcing string conversion')
+            info_dict['id'] = compat_str(info_dict['id'])
+
         if 'playlist' not in info_dict:
             # It isn't part of a playlist
             info_dict['playlist'] = None
@@ -1639,7 +1652,7 @@ class YoutubeDL(object):
                     # Just a single file
                     success = dl(filename, info_dict)
             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self.report_error('unable to download video data: %s' % str(err))
+                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
                 return
             except (OSError, IOError) as err:
                 raise UnavailableVideoError(err)
@@ -2018,6 +2031,7 @@ class YoutubeDL(object):
         if opts_cookiefile is None:
             self.cookiejar = compat_cookiejar.CookieJar()
         else:
+            opts_cookiefile = compat_expanduser(opts_cookiefile)
             self.cookiejar = compat_cookiejar.MozillaCookieJar(
                 opts_cookiefile)
             if os.access(opts_cookiefile, os.R_OK):
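
Review note: without compat_expanduser the '~' is passed through literally and MozillaCookieJar would try to open a file named '~/...' relative to the current directory. Illustration (hypothetical path):

    from youtube_dl.compat import compat_expanduser

    # '--cookies ~/cookies.txt' now resolves against the home directory:
    print(compat_expanduser('~/cookies.txt'))  # e.g. /home/user/cookies.txt
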
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 737f6545d4136401dd3d8ddd691ad52e86894bb0..2b34bf9c282983ee2c53bc9d340232831a31a516 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -18,7 +18,6 @@ from .options import (
 from .compat import (
     compat_expanduser,
     compat_getpass,
-    compat_print,
     compat_shlex_split,
     workaround_optparse_bug9161,
 )
@@ -67,16 +66,16 @@ def _real_main(argv=None):
     # Custom HTTP headers
     if opts.headers is not None:
         for h in opts.headers:
-            if h.find(':', 1) < 0:
+            if ':' not in h:
                 parser.error('wrong header formatting, it should be key:value, not "%s"' % h)
-            key, value = h.split(':', 2)
+            key, value = h.split(':', 1)
             if opts.verbose:
                 write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))
             std_headers[key] = value
 
     # Dump user agent
     if opts.dump_user_agent:
-        compat_print(std_headers['User-Agent'])
+        write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
         sys.exit(0)
 
     # Batch file verification
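
Review note: the maxsplit fix matters for header values that themselves contain colons. With maxsplit=2 a value such as a URL yields three fields, so the two-name unpacking raises ValueError; maxsplit=1 always yields exactly two. A standalone illustration (plain Python, not from the patch):

    h = 'Referer:http://example.com/'

    try:
        key, value = h.split(':', 2)   # old: ['Referer', 'http', '//example.com/']
    except ValueError:
        pass                           # too many values to unpack

    key, value = h.split(':', 1)       # new: exactly two fields
    assert (key, value) == ('Referer', 'http://example.com/')
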
@@ -86,7 +85,9 @@ def _real_main(argv=None):
             if opts.batchfile == '-':
                 batchfd = sys.stdin
             else:
-                batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+                batchfd = io.open(
+                    compat_expanduser(opts.batchfile),
+                    'r', encoding='utf-8', errors='ignore')
             batch_urls = read_batch_urls(batchfd)
             if opts.verbose:
                 write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
@@ -99,10 +100,10 @@ def _real_main(argv=None):
 
     if opts.list_extractors:
         for ie in list_extractors(opts.age_limit):
-            compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
+            write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
             matchedUrls = [url for url in all_urls if ie.suitable(url)]
             for mu in matchedUrls:
-                compat_print('  ' + mu)
+                write_string('  ' + mu + '\n', out=sys.stdout)
         sys.exit(0)
     if opts.list_extractor_descriptions:
         for ie in list_extractors(opts.age_limit):
@@ -115,7 +116,7 @@ def _real_main(argv=None):
                 _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
                 _COUNTS = ('', '5', '10', 'all')
                 desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
-            compat_print(desc)
+            write_string(desc + '\n', out=sys.stdout)
         sys.exit(0)
 
     # Conflicting, missing and erroneous options
@@ -381,6 +382,8 @@ def _real_main(argv=None):
         'external_downloader_args': external_downloader_args,
         'postprocessor_args': postprocessor_args,
         'cn_verification_proxy': opts.cn_verification_proxy,
+        'geo_verification_proxy': opts.geo_verification_proxy,
+
     }
 
     with YoutubeDL(ydl_opts) as ydl:
@@ -404,7 +407,7 @@ def _real_main(argv=None):
 
         try:
             if opts.load_info_filename is not None:
-                retcode = ydl.download_with_info_file(opts.load_info_filename)
+                retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))
             else:
                 retcode = ydl.download(all_urls)
         except MaxDownloadsReached:
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 0b6c5ca7a8ba5eb6cb064916d56a5ca8eae32003..b8aaf5a461c9e3ca2884c748ebb3225a2fd9fe29 100644 (file)
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import binascii
@@ -11,6 +12,7 @@ import re
 import shlex
 import shutil
 import socket
+import struct
 import subprocess
 import sys
 import itertools
@@ -62,6 +64,2244 @@ try:
 except ImportError:  # Python 2
     import htmlentitydefs as compat_html_entities
 
+try:  # Python >= 3.3
+    compat_html_entities_html5 = compat_html_entities.html5
+except AttributeError:
+    # Copied from CPython 3.5.1 html/entities.py
+    compat_html_entities_html5 = {
+        'Aacute': '\xc1',
+        'aacute': '\xe1',
+        'Aacute;': '\xc1',
+        'aacute;': '\xe1',
+        'Abreve;': '\u0102',
+        'abreve;': '\u0103',
+        'ac;': '\u223e',
+        'acd;': '\u223f',
+        'acE;': '\u223e\u0333',
+        'Acirc': '\xc2',
+        'acirc': '\xe2',
+        'Acirc;': '\xc2',
+        'acirc;': '\xe2',
+        'acute': '\xb4',
+        'acute;': '\xb4',
+        'Acy;': '\u0410',
+        'acy;': '\u0430',
+        'AElig': '\xc6',
+        'aelig': '\xe6',
+        'AElig;': '\xc6',
+        'aelig;': '\xe6',
+        'af;': '\u2061',
+        'Afr;': '\U0001d504',
+        'afr;': '\U0001d51e',
+        'Agrave': '\xc0',
+        'agrave': '\xe0',
+        'Agrave;': '\xc0',
+        'agrave;': '\xe0',
+        'alefsym;': '\u2135',
+        'aleph;': '\u2135',
+        'Alpha;': '\u0391',
+        'alpha;': '\u03b1',
+        'Amacr;': '\u0100',
+        'amacr;': '\u0101',
+        'amalg;': '\u2a3f',
+        'AMP': '&',
+        'amp': '&',
+        'AMP;': '&',
+        'amp;': '&',
+        'And;': '\u2a53',
+        'and;': '\u2227',
+        'andand;': '\u2a55',
+        'andd;': '\u2a5c',
+        'andslope;': '\u2a58',
+        'andv;': '\u2a5a',
+        'ang;': '\u2220',
+        'ange;': '\u29a4',
+        'angle;': '\u2220',
+        'angmsd;': '\u2221',
+        'angmsdaa;': '\u29a8',
+        'angmsdab;': '\u29a9',
+        'angmsdac;': '\u29aa',
+        'angmsdad;': '\u29ab',
+        'angmsdae;': '\u29ac',
+        'angmsdaf;': '\u29ad',
+        'angmsdag;': '\u29ae',
+        'angmsdah;': '\u29af',
+        'angrt;': '\u221f',
+        'angrtvb;': '\u22be',
+        'angrtvbd;': '\u299d',
+        'angsph;': '\u2222',
+        'angst;': '\xc5',
+        'angzarr;': '\u237c',
+        'Aogon;': '\u0104',
+        'aogon;': '\u0105',
+        'Aopf;': '\U0001d538',
+        'aopf;': '\U0001d552',
+        'ap;': '\u2248',
+        'apacir;': '\u2a6f',
+        'apE;': '\u2a70',
+        'ape;': '\u224a',
+        'apid;': '\u224b',
+        'apos;': "'",
+        'ApplyFunction;': '\u2061',
+        'approx;': '\u2248',
+        'approxeq;': '\u224a',
+        'Aring': '\xc5',
+        'aring': '\xe5',
+        'Aring;': '\xc5',
+        'aring;': '\xe5',
+        'Ascr;': '\U0001d49c',
+        'ascr;': '\U0001d4b6',
+        'Assign;': '\u2254',
+        'ast;': '*',
+        'asymp;': '\u2248',
+        'asympeq;': '\u224d',
+        'Atilde': '\xc3',
+        'atilde': '\xe3',
+        'Atilde;': '\xc3',
+        'atilde;': '\xe3',
+        'Auml': '\xc4',
+        'auml': '\xe4',
+        'Auml;': '\xc4',
+        'auml;': '\xe4',
+        'awconint;': '\u2233',
+        'awint;': '\u2a11',
+        'backcong;': '\u224c',
+        'backepsilon;': '\u03f6',
+        'backprime;': '\u2035',
+        'backsim;': '\u223d',
+        'backsimeq;': '\u22cd',
+        'Backslash;': '\u2216',
+        'Barv;': '\u2ae7',
+        'barvee;': '\u22bd',
+        'Barwed;': '\u2306',
+        'barwed;': '\u2305',
+        'barwedge;': '\u2305',
+        'bbrk;': '\u23b5',
+        'bbrktbrk;': '\u23b6',
+        'bcong;': '\u224c',
+        'Bcy;': '\u0411',
+        'bcy;': '\u0431',
+        'bdquo;': '\u201e',
+        'becaus;': '\u2235',
+        'Because;': '\u2235',
+        'because;': '\u2235',
+        'bemptyv;': '\u29b0',
+        'bepsi;': '\u03f6',
+        'bernou;': '\u212c',
+        'Bernoullis;': '\u212c',
+        'Beta;': '\u0392',
+        'beta;': '\u03b2',
+        'beth;': '\u2136',
+        'between;': '\u226c',
+        'Bfr;': '\U0001d505',
+        'bfr;': '\U0001d51f',
+        'bigcap;': '\u22c2',
+        'bigcirc;': '\u25ef',
+        'bigcup;': '\u22c3',
+        'bigodot;': '\u2a00',
+        'bigoplus;': '\u2a01',
+        'bigotimes;': '\u2a02',
+        'bigsqcup;': '\u2a06',
+        'bigstar;': '\u2605',
+        'bigtriangledown;': '\u25bd',
+        'bigtriangleup;': '\u25b3',
+        'biguplus;': '\u2a04',
+        'bigvee;': '\u22c1',
+        'bigwedge;': '\u22c0',
+        'bkarow;': '\u290d',
+        'blacklozenge;': '\u29eb',
+        'blacksquare;': '\u25aa',
+        'blacktriangle;': '\u25b4',
+        'blacktriangledown;': '\u25be',
+        'blacktriangleleft;': '\u25c2',
+        'blacktriangleright;': '\u25b8',
+        'blank;': '\u2423',
+        'blk12;': '\u2592',
+        'blk14;': '\u2591',
+        'blk34;': '\u2593',
+        'block;': '\u2588',
+        'bne;': '=\u20e5',
+        'bnequiv;': '\u2261\u20e5',
+        'bNot;': '\u2aed',
+        'bnot;': '\u2310',
+        'Bopf;': '\U0001d539',
+        'bopf;': '\U0001d553',
+        'bot;': '\u22a5',
+        'bottom;': '\u22a5',
+        'bowtie;': '\u22c8',
+        'boxbox;': '\u29c9',
+        'boxDL;': '\u2557',
+        'boxDl;': '\u2556',
+        'boxdL;': '\u2555',
+        'boxdl;': '\u2510',
+        'boxDR;': '\u2554',
+        'boxDr;': '\u2553',
+        'boxdR;': '\u2552',
+        'boxdr;': '\u250c',
+        'boxH;': '\u2550',
+        'boxh;': '\u2500',
+        'boxHD;': '\u2566',
+        'boxHd;': '\u2564',
+        'boxhD;': '\u2565',
+        'boxhd;': '\u252c',
+        'boxHU;': '\u2569',
+        'boxHu;': '\u2567',
+        'boxhU;': '\u2568',
+        'boxhu;': '\u2534',
+        'boxminus;': '\u229f',
+        'boxplus;': '\u229e',
+        'boxtimes;': '\u22a0',
+        'boxUL;': '\u255d',
+        'boxUl;': '\u255c',
+        'boxuL;': '\u255b',
+        'boxul;': '\u2518',
+        'boxUR;': '\u255a',
+        'boxUr;': '\u2559',
+        'boxuR;': '\u2558',
+        'boxur;': '\u2514',
+        'boxV;': '\u2551',
+        'boxv;': '\u2502',
+        'boxVH;': '\u256c',
+        'boxVh;': '\u256b',
+        'boxvH;': '\u256a',
+        'boxvh;': '\u253c',
+        'boxVL;': '\u2563',
+        'boxVl;': '\u2562',
+        'boxvL;': '\u2561',
+        'boxvl;': '\u2524',
+        'boxVR;': '\u2560',
+        'boxVr;': '\u255f',
+        'boxvR;': '\u255e',
+        'boxvr;': '\u251c',
+        'bprime;': '\u2035',
+        'Breve;': '\u02d8',
+        'breve;': '\u02d8',
+        'brvbar': '\xa6',
+        'brvbar;': '\xa6',
+        'Bscr;': '\u212c',
+        'bscr;': '\U0001d4b7',
+        'bsemi;': '\u204f',
+        'bsim;': '\u223d',
+        'bsime;': '\u22cd',
+        'bsol;': '\\',
+        'bsolb;': '\u29c5',
+        'bsolhsub;': '\u27c8',
+        'bull;': '\u2022',
+        'bullet;': '\u2022',
+        'bump;': '\u224e',
+        'bumpE;': '\u2aae',
+        'bumpe;': '\u224f',
+        'Bumpeq;': '\u224e',
+        'bumpeq;': '\u224f',
+        'Cacute;': '\u0106',
+        'cacute;': '\u0107',
+        'Cap;': '\u22d2',
+        'cap;': '\u2229',
+        'capand;': '\u2a44',
+        'capbrcup;': '\u2a49',
+        'capcap;': '\u2a4b',
+        'capcup;': '\u2a47',
+        'capdot;': '\u2a40',
+        'CapitalDifferentialD;': '\u2145',
+        'caps;': '\u2229\ufe00',
+        'caret;': '\u2041',
+        'caron;': '\u02c7',
+        'Cayleys;': '\u212d',
+        'ccaps;': '\u2a4d',
+        'Ccaron;': '\u010c',
+        'ccaron;': '\u010d',
+        'Ccedil': '\xc7',
+        'ccedil': '\xe7',
+        'Ccedil;': '\xc7',
+        'ccedil;': '\xe7',
+        'Ccirc;': '\u0108',
+        'ccirc;': '\u0109',
+        'Cconint;': '\u2230',
+        'ccups;': '\u2a4c',
+        'ccupssm;': '\u2a50',
+        'Cdot;': '\u010a',
+        'cdot;': '\u010b',
+        'cedil': '\xb8',
+        'cedil;': '\xb8',
+        'Cedilla;': '\xb8',
+        'cemptyv;': '\u29b2',
+        'cent': '\xa2',
+        'cent;': '\xa2',
+        'CenterDot;': '\xb7',
+        'centerdot;': '\xb7',
+        'Cfr;': '\u212d',
+        'cfr;': '\U0001d520',
+        'CHcy;': '\u0427',
+        'chcy;': '\u0447',
+        'check;': '\u2713',
+        'checkmark;': '\u2713',
+        'Chi;': '\u03a7',
+        'chi;': '\u03c7',
+        'cir;': '\u25cb',
+        'circ;': '\u02c6',
+        'circeq;': '\u2257',
+        'circlearrowleft;': '\u21ba',
+        'circlearrowright;': '\u21bb',
+        'circledast;': '\u229b',
+        'circledcirc;': '\u229a',
+        'circleddash;': '\u229d',
+        'CircleDot;': '\u2299',
+        'circledR;': '\xae',
+        'circledS;': '\u24c8',
+        'CircleMinus;': '\u2296',
+        'CirclePlus;': '\u2295',
+        'CircleTimes;': '\u2297',
+        'cirE;': '\u29c3',
+        'cire;': '\u2257',
+        'cirfnint;': '\u2a10',
+        'cirmid;': '\u2aef',
+        'cirscir;': '\u29c2',
+        'ClockwiseContourIntegral;': '\u2232',
+        'CloseCurlyDoubleQuote;': '\u201d',
+        'CloseCurlyQuote;': '\u2019',
+        'clubs;': '\u2663',
+        'clubsuit;': '\u2663',
+        'Colon;': '\u2237',
+        'colon;': ':',
+        'Colone;': '\u2a74',
+        'colone;': '\u2254',
+        'coloneq;': '\u2254',
+        'comma;': ',',
+        'commat;': '@',
+        'comp;': '\u2201',
+        'compfn;': '\u2218',
+        'complement;': '\u2201',
+        'complexes;': '\u2102',
+        'cong;': '\u2245',
+        'congdot;': '\u2a6d',
+        'Congruent;': '\u2261',
+        'Conint;': '\u222f',
+        'conint;': '\u222e',
+        'ContourIntegral;': '\u222e',
+        'Copf;': '\u2102',
+        'copf;': '\U0001d554',
+        'coprod;': '\u2210',
+        'Coproduct;': '\u2210',
+        'COPY': '\xa9',
+        'copy': '\xa9',
+        'COPY;': '\xa9',
+        'copy;': '\xa9',
+        'copysr;': '\u2117',
+        'CounterClockwiseContourIntegral;': '\u2233',
+        'crarr;': '\u21b5',
+        'Cross;': '\u2a2f',
+        'cross;': '\u2717',
+        'Cscr;': '\U0001d49e',
+        'cscr;': '\U0001d4b8',
+        'csub;': '\u2acf',
+        'csube;': '\u2ad1',
+        'csup;': '\u2ad0',
+        'csupe;': '\u2ad2',
+        'ctdot;': '\u22ef',
+        'cudarrl;': '\u2938',
+        'cudarrr;': '\u2935',
+        'cuepr;': '\u22de',
+        'cuesc;': '\u22df',
+        'cularr;': '\u21b6',
+        'cularrp;': '\u293d',
+        'Cup;': '\u22d3',
+        'cup;': '\u222a',
+        'cupbrcap;': '\u2a48',
+        'CupCap;': '\u224d',
+        'cupcap;': '\u2a46',
+        'cupcup;': '\u2a4a',
+        'cupdot;': '\u228d',
+        'cupor;': '\u2a45',
+        'cups;': '\u222a\ufe00',
+        'curarr;': '\u21b7',
+        'curarrm;': '\u293c',
+        'curlyeqprec;': '\u22de',
+        'curlyeqsucc;': '\u22df',
+        'curlyvee;': '\u22ce',
+        'curlywedge;': '\u22cf',
+        'curren': '\xa4',
+        'curren;': '\xa4',
+        'curvearrowleft;': '\u21b6',
+        'curvearrowright;': '\u21b7',
+        'cuvee;': '\u22ce',
+        'cuwed;': '\u22cf',
+        'cwconint;': '\u2232',
+        'cwint;': '\u2231',
+        'cylcty;': '\u232d',
+        'Dagger;': '\u2021',
+        'dagger;': '\u2020',
+        'daleth;': '\u2138',
+        'Darr;': '\u21a1',
+        'dArr;': '\u21d3',
+        'darr;': '\u2193',
+        'dash;': '\u2010',
+        'Dashv;': '\u2ae4',
+        'dashv;': '\u22a3',
+        'dbkarow;': '\u290f',
+        'dblac;': '\u02dd',
+        'Dcaron;': '\u010e',
+        'dcaron;': '\u010f',
+        'Dcy;': '\u0414',
+        'dcy;': '\u0434',
+        'DD;': '\u2145',
+        'dd;': '\u2146',
+        'ddagger;': '\u2021',
+        'ddarr;': '\u21ca',
+        'DDotrahd;': '\u2911',
+        'ddotseq;': '\u2a77',
+        'deg': '\xb0',
+        'deg;': '\xb0',
+        'Del;': '\u2207',
+        'Delta;': '\u0394',
+        'delta;': '\u03b4',
+        'demptyv;': '\u29b1',
+        'dfisht;': '\u297f',
+        'Dfr;': '\U0001d507',
+        'dfr;': '\U0001d521',
+        'dHar;': '\u2965',
+        'dharl;': '\u21c3',
+        'dharr;': '\u21c2',
+        'DiacriticalAcute;': '\xb4',
+        'DiacriticalDot;': '\u02d9',
+        'DiacriticalDoubleAcute;': '\u02dd',
+        'DiacriticalGrave;': '`',
+        'DiacriticalTilde;': '\u02dc',
+        'diam;': '\u22c4',
+        'Diamond;': '\u22c4',
+        'diamond;': '\u22c4',
+        'diamondsuit;': '\u2666',
+        'diams;': '\u2666',
+        'die;': '\xa8',
+        'DifferentialD;': '\u2146',
+        'digamma;': '\u03dd',
+        'disin;': '\u22f2',
+        'div;': '\xf7',
+        'divide': '\xf7',
+        'divide;': '\xf7',
+        'divideontimes;': '\u22c7',
+        'divonx;': '\u22c7',
+        'DJcy;': '\u0402',
+        'djcy;': '\u0452',
+        'dlcorn;': '\u231e',
+        'dlcrop;': '\u230d',
+        'dollar;': '$',
+        'Dopf;': '\U0001d53b',
+        'dopf;': '\U0001d555',
+        'Dot;': '\xa8',
+        'dot;': '\u02d9',
+        'DotDot;': '\u20dc',
+        'doteq;': '\u2250',
+        'doteqdot;': '\u2251',
+        'DotEqual;': '\u2250',
+        'dotminus;': '\u2238',
+        'dotplus;': '\u2214',
+        'dotsquare;': '\u22a1',
+        'doublebarwedge;': '\u2306',
+        'DoubleContourIntegral;': '\u222f',
+        'DoubleDot;': '\xa8',
+        'DoubleDownArrow;': '\u21d3',
+        'DoubleLeftArrow;': '\u21d0',
+        'DoubleLeftRightArrow;': '\u21d4',
+        'DoubleLeftTee;': '\u2ae4',
+        'DoubleLongLeftArrow;': '\u27f8',
+        'DoubleLongLeftRightArrow;': '\u27fa',
+        'DoubleLongRightArrow;': '\u27f9',
+        'DoubleRightArrow;': '\u21d2',
+        'DoubleRightTee;': '\u22a8',
+        'DoubleUpArrow;': '\u21d1',
+        'DoubleUpDownArrow;': '\u21d5',
+        'DoubleVerticalBar;': '\u2225',
+        'DownArrow;': '\u2193',
+        'Downarrow;': '\u21d3',
+        'downarrow;': '\u2193',
+        'DownArrowBar;': '\u2913',
+        'DownArrowUpArrow;': '\u21f5',
+        'DownBreve;': '\u0311',
+        'downdownarrows;': '\u21ca',
+        'downharpoonleft;': '\u21c3',
+        'downharpoonright;': '\u21c2',
+        'DownLeftRightVector;': '\u2950',
+        'DownLeftTeeVector;': '\u295e',
+        'DownLeftVector;': '\u21bd',
+        'DownLeftVectorBar;': '\u2956',
+        'DownRightTeeVector;': '\u295f',
+        'DownRightVector;': '\u21c1',
+        'DownRightVectorBar;': '\u2957',
+        'DownTee;': '\u22a4',
+        'DownTeeArrow;': '\u21a7',
+        'drbkarow;': '\u2910',
+        'drcorn;': '\u231f',
+        'drcrop;': '\u230c',
+        'Dscr;': '\U0001d49f',
+        'dscr;': '\U0001d4b9',
+        'DScy;': '\u0405',
+        'dscy;': '\u0455',
+        'dsol;': '\u29f6',
+        'Dstrok;': '\u0110',
+        'dstrok;': '\u0111',
+        'dtdot;': '\u22f1',
+        'dtri;': '\u25bf',
+        'dtrif;': '\u25be',
+        'duarr;': '\u21f5',
+        'duhar;': '\u296f',
+        'dwangle;': '\u29a6',
+        'DZcy;': '\u040f',
+        'dzcy;': '\u045f',
+        'dzigrarr;': '\u27ff',
+        'Eacute': '\xc9',
+        'eacute': '\xe9',
+        'Eacute;': '\xc9',
+        'eacute;': '\xe9',
+        'easter;': '\u2a6e',
+        'Ecaron;': '\u011a',
+        'ecaron;': '\u011b',
+        'ecir;': '\u2256',
+        'Ecirc': '\xca',
+        'ecirc': '\xea',
+        'Ecirc;': '\xca',
+        'ecirc;': '\xea',
+        'ecolon;': '\u2255',
+        'Ecy;': '\u042d',
+        'ecy;': '\u044d',
+        'eDDot;': '\u2a77',
+        'Edot;': '\u0116',
+        'eDot;': '\u2251',
+        'edot;': '\u0117',
+        'ee;': '\u2147',
+        'efDot;': '\u2252',
+        'Efr;': '\U0001d508',
+        'efr;': '\U0001d522',
+        'eg;': '\u2a9a',
+        'Egrave': '\xc8',
+        'egrave': '\xe8',
+        'Egrave;': '\xc8',
+        'egrave;': '\xe8',
+        'egs;': '\u2a96',
+        'egsdot;': '\u2a98',
+        'el;': '\u2a99',
+        'Element;': '\u2208',
+        'elinters;': '\u23e7',
+        'ell;': '\u2113',
+        'els;': '\u2a95',
+        'elsdot;': '\u2a97',
+        'Emacr;': '\u0112',
+        'emacr;': '\u0113',
+        'empty;': '\u2205',
+        'emptyset;': '\u2205',
+        'EmptySmallSquare;': '\u25fb',
+        'emptyv;': '\u2205',
+        'EmptyVerySmallSquare;': '\u25ab',
+        'emsp13;': '\u2004',
+        'emsp14;': '\u2005',
+        'emsp;': '\u2003',
+        'ENG;': '\u014a',
+        'eng;': '\u014b',
+        'ensp;': '\u2002',
+        'Eogon;': '\u0118',
+        'eogon;': '\u0119',
+        'Eopf;': '\U0001d53c',
+        'eopf;': '\U0001d556',
+        'epar;': '\u22d5',
+        'eparsl;': '\u29e3',
+        'eplus;': '\u2a71',
+        'epsi;': '\u03b5',
+        'Epsilon;': '\u0395',
+        'epsilon;': '\u03b5',
+        'epsiv;': '\u03f5',
+        'eqcirc;': '\u2256',
+        'eqcolon;': '\u2255',
+        'eqsim;': '\u2242',
+        'eqslantgtr;': '\u2a96',
+        'eqslantless;': '\u2a95',
+        'Equal;': '\u2a75',
+        'equals;': '=',
+        'EqualTilde;': '\u2242',
+        'equest;': '\u225f',
+        'Equilibrium;': '\u21cc',
+        'equiv;': '\u2261',
+        'equivDD;': '\u2a78',
+        'eqvparsl;': '\u29e5',
+        'erarr;': '\u2971',
+        'erDot;': '\u2253',
+        'Escr;': '\u2130',
+        'escr;': '\u212f',
+        'esdot;': '\u2250',
+        'Esim;': '\u2a73',
+        'esim;': '\u2242',
+        'Eta;': '\u0397',
+        'eta;': '\u03b7',
+        'ETH': '\xd0',
+        'eth': '\xf0',
+        'ETH;': '\xd0',
+        'eth;': '\xf0',
+        'Euml': '\xcb',
+        'euml': '\xeb',
+        'Euml;': '\xcb',
+        'euml;': '\xeb',
+        'euro;': '\u20ac',
+        'excl;': '!',
+        'exist;': '\u2203',
+        'Exists;': '\u2203',
+        'expectation;': '\u2130',
+        'ExponentialE;': '\u2147',
+        'exponentiale;': '\u2147',
+        'fallingdotseq;': '\u2252',
+        'Fcy;': '\u0424',
+        'fcy;': '\u0444',
+        'female;': '\u2640',
+        'ffilig;': '\ufb03',
+        'fflig;': '\ufb00',
+        'ffllig;': '\ufb04',
+        'Ffr;': '\U0001d509',
+        'ffr;': '\U0001d523',
+        'filig;': '\ufb01',
+        'FilledSmallSquare;': '\u25fc',
+        'FilledVerySmallSquare;': '\u25aa',
+        'fjlig;': 'fj',
+        'flat;': '\u266d',
+        'fllig;': '\ufb02',
+        'fltns;': '\u25b1',
+        'fnof;': '\u0192',
+        'Fopf;': '\U0001d53d',
+        'fopf;': '\U0001d557',
+        'ForAll;': '\u2200',
+        'forall;': '\u2200',
+        'fork;': '\u22d4',
+        'forkv;': '\u2ad9',
+        'Fouriertrf;': '\u2131',
+        'fpartint;': '\u2a0d',
+        'frac12': '\xbd',
+        'frac12;': '\xbd',
+        'frac13;': '\u2153',
+        'frac14': '\xbc',
+        'frac14;': '\xbc',
+        'frac15;': '\u2155',
+        'frac16;': '\u2159',
+        'frac18;': '\u215b',
+        'frac23;': '\u2154',
+        'frac25;': '\u2156',
+        'frac34': '\xbe',
+        'frac34;': '\xbe',
+        'frac35;': '\u2157',
+        'frac38;': '\u215c',
+        'frac45;': '\u2158',
+        'frac56;': '\u215a',
+        'frac58;': '\u215d',
+        'frac78;': '\u215e',
+        'frasl;': '\u2044',
+        'frown;': '\u2322',
+        'Fscr;': '\u2131',
+        'fscr;': '\U0001d4bb',
+        'gacute;': '\u01f5',
+        'Gamma;': '\u0393',
+        'gamma;': '\u03b3',
+        'Gammad;': '\u03dc',
+        'gammad;': '\u03dd',
+        'gap;': '\u2a86',
+        'Gbreve;': '\u011e',
+        'gbreve;': '\u011f',
+        'Gcedil;': '\u0122',
+        'Gcirc;': '\u011c',
+        'gcirc;': '\u011d',
+        'Gcy;': '\u0413',
+        'gcy;': '\u0433',
+        'Gdot;': '\u0120',
+        'gdot;': '\u0121',
+        'gE;': '\u2267',
+        'ge;': '\u2265',
+        'gEl;': '\u2a8c',
+        'gel;': '\u22db',
+        'geq;': '\u2265',
+        'geqq;': '\u2267',
+        'geqslant;': '\u2a7e',
+        'ges;': '\u2a7e',
+        'gescc;': '\u2aa9',
+        'gesdot;': '\u2a80',
+        'gesdoto;': '\u2a82',
+        'gesdotol;': '\u2a84',
+        'gesl;': '\u22db\ufe00',
+        'gesles;': '\u2a94',
+        'Gfr;': '\U0001d50a',
+        'gfr;': '\U0001d524',
+        'Gg;': '\u22d9',
+        'gg;': '\u226b',
+        'ggg;': '\u22d9',
+        'gimel;': '\u2137',
+        'GJcy;': '\u0403',
+        'gjcy;': '\u0453',
+        'gl;': '\u2277',
+        'gla;': '\u2aa5',
+        'glE;': '\u2a92',
+        'glj;': '\u2aa4',
+        'gnap;': '\u2a8a',
+        'gnapprox;': '\u2a8a',
+        'gnE;': '\u2269',
+        'gne;': '\u2a88',
+        'gneq;': '\u2a88',
+        'gneqq;': '\u2269',
+        'gnsim;': '\u22e7',
+        'Gopf;': '\U0001d53e',
+        'gopf;': '\U0001d558',
+        'grave;': '`',
+        'GreaterEqual;': '\u2265',
+        'GreaterEqualLess;': '\u22db',
+        'GreaterFullEqual;': '\u2267',
+        'GreaterGreater;': '\u2aa2',
+        'GreaterLess;': '\u2277',
+        'GreaterSlantEqual;': '\u2a7e',
+        'GreaterTilde;': '\u2273',
+        'Gscr;': '\U0001d4a2',
+        'gscr;': '\u210a',
+        'gsim;': '\u2273',
+        'gsime;': '\u2a8e',
+        'gsiml;': '\u2a90',
+        'GT': '>',
+        'gt': '>',
+        'GT;': '>',
+        'Gt;': '\u226b',
+        'gt;': '>',
+        'gtcc;': '\u2aa7',
+        'gtcir;': '\u2a7a',
+        'gtdot;': '\u22d7',
+        'gtlPar;': '\u2995',
+        'gtquest;': '\u2a7c',
+        'gtrapprox;': '\u2a86',
+        'gtrarr;': '\u2978',
+        'gtrdot;': '\u22d7',
+        'gtreqless;': '\u22db',
+        'gtreqqless;': '\u2a8c',
+        'gtrless;': '\u2277',
+        'gtrsim;': '\u2273',
+        'gvertneqq;': '\u2269\ufe00',
+        'gvnE;': '\u2269\ufe00',
+        'Hacek;': '\u02c7',
+        'hairsp;': '\u200a',
+        'half;': '\xbd',
+        'hamilt;': '\u210b',
+        'HARDcy;': '\u042a',
+        'hardcy;': '\u044a',
+        'hArr;': '\u21d4',
+        'harr;': '\u2194',
+        'harrcir;': '\u2948',
+        'harrw;': '\u21ad',
+        'Hat;': '^',
+        'hbar;': '\u210f',
+        'Hcirc;': '\u0124',
+        'hcirc;': '\u0125',
+        'hearts;': '\u2665',
+        'heartsuit;': '\u2665',
+        'hellip;': '\u2026',
+        'hercon;': '\u22b9',
+        'Hfr;': '\u210c',
+        'hfr;': '\U0001d525',
+        'HilbertSpace;': '\u210b',
+        'hksearow;': '\u2925',
+        'hkswarow;': '\u2926',
+        'hoarr;': '\u21ff',
+        'homtht;': '\u223b',
+        'hookleftarrow;': '\u21a9',
+        'hookrightarrow;': '\u21aa',
+        'Hopf;': '\u210d',
+        'hopf;': '\U0001d559',
+        'horbar;': '\u2015',
+        'HorizontalLine;': '\u2500',
+        'Hscr;': '\u210b',
+        'hscr;': '\U0001d4bd',
+        'hslash;': '\u210f',
+        'Hstrok;': '\u0126',
+        'hstrok;': '\u0127',
+        'HumpDownHump;': '\u224e',
+        'HumpEqual;': '\u224f',
+        'hybull;': '\u2043',
+        'hyphen;': '\u2010',
+        'Iacute': '\xcd',
+        'iacute': '\xed',
+        'Iacute;': '\xcd',
+        'iacute;': '\xed',
+        'ic;': '\u2063',
+        'Icirc': '\xce',
+        'icirc': '\xee',
+        'Icirc;': '\xce',
+        'icirc;': '\xee',
+        'Icy;': '\u0418',
+        'icy;': '\u0438',
+        'Idot;': '\u0130',
+        'IEcy;': '\u0415',
+        'iecy;': '\u0435',
+        'iexcl': '\xa1',
+        'iexcl;': '\xa1',
+        'iff;': '\u21d4',
+        'Ifr;': '\u2111',
+        'ifr;': '\U0001d526',
+        'Igrave': '\xcc',
+        'igrave': '\xec',
+        'Igrave;': '\xcc',
+        'igrave;': '\xec',
+        'ii;': '\u2148',
+        'iiiint;': '\u2a0c',
+        'iiint;': '\u222d',
+        'iinfin;': '\u29dc',
+        'iiota;': '\u2129',
+        'IJlig;': '\u0132',
+        'ijlig;': '\u0133',
+        'Im;': '\u2111',
+        'Imacr;': '\u012a',
+        'imacr;': '\u012b',
+        'image;': '\u2111',
+        'ImaginaryI;': '\u2148',
+        'imagline;': '\u2110',
+        'imagpart;': '\u2111',
+        'imath;': '\u0131',
+        'imof;': '\u22b7',
+        'imped;': '\u01b5',
+        'Implies;': '\u21d2',
+        'in;': '\u2208',
+        'incare;': '\u2105',
+        'infin;': '\u221e',
+        'infintie;': '\u29dd',
+        'inodot;': '\u0131',
+        'Int;': '\u222c',
+        'int;': '\u222b',
+        'intcal;': '\u22ba',
+        'integers;': '\u2124',
+        'Integral;': '\u222b',
+        'intercal;': '\u22ba',
+        'Intersection;': '\u22c2',
+        'intlarhk;': '\u2a17',
+        'intprod;': '\u2a3c',
+        'InvisibleComma;': '\u2063',
+        'InvisibleTimes;': '\u2062',
+        'IOcy;': '\u0401',
+        'iocy;': '\u0451',
+        'Iogon;': '\u012e',
+        'iogon;': '\u012f',
+        'Iopf;': '\U0001d540',
+        'iopf;': '\U0001d55a',
+        'Iota;': '\u0399',
+        'iota;': '\u03b9',
+        'iprod;': '\u2a3c',
+        'iquest': '\xbf',
+        'iquest;': '\xbf',
+        'Iscr;': '\u2110',
+        'iscr;': '\U0001d4be',
+        'isin;': '\u2208',
+        'isindot;': '\u22f5',
+        'isinE;': '\u22f9',
+        'isins;': '\u22f4',
+        'isinsv;': '\u22f3',
+        'isinv;': '\u2208',
+        'it;': '\u2062',
+        'Itilde;': '\u0128',
+        'itilde;': '\u0129',
+        'Iukcy;': '\u0406',
+        'iukcy;': '\u0456',
+        'Iuml': '\xcf',
+        'iuml': '\xef',
+        'Iuml;': '\xcf',
+        'iuml;': '\xef',
+        'Jcirc;': '\u0134',
+        'jcirc;': '\u0135',
+        'Jcy;': '\u0419',
+        'jcy;': '\u0439',
+        'Jfr;': '\U0001d50d',
+        'jfr;': '\U0001d527',
+        'jmath;': '\u0237',
+        'Jopf;': '\U0001d541',
+        'jopf;': '\U0001d55b',
+        'Jscr;': '\U0001d4a5',
+        'jscr;': '\U0001d4bf',
+        'Jsercy;': '\u0408',
+        'jsercy;': '\u0458',
+        'Jukcy;': '\u0404',
+        'jukcy;': '\u0454',
+        'Kappa;': '\u039a',
+        'kappa;': '\u03ba',
+        'kappav;': '\u03f0',
+        'Kcedil;': '\u0136',
+        'kcedil;': '\u0137',
+        'Kcy;': '\u041a',
+        'kcy;': '\u043a',
+        'Kfr;': '\U0001d50e',
+        'kfr;': '\U0001d528',
+        'kgreen;': '\u0138',
+        'KHcy;': '\u0425',
+        'khcy;': '\u0445',
+        'KJcy;': '\u040c',
+        'kjcy;': '\u045c',
+        'Kopf;': '\U0001d542',
+        'kopf;': '\U0001d55c',
+        'Kscr;': '\U0001d4a6',
+        'kscr;': '\U0001d4c0',
+        'lAarr;': '\u21da',
+        'Lacute;': '\u0139',
+        'lacute;': '\u013a',
+        'laemptyv;': '\u29b4',
+        'lagran;': '\u2112',
+        'Lambda;': '\u039b',
+        'lambda;': '\u03bb',
+        'Lang;': '\u27ea',
+        'lang;': '\u27e8',
+        'langd;': '\u2991',
+        'langle;': '\u27e8',
+        'lap;': '\u2a85',
+        'Laplacetrf;': '\u2112',
+        'laquo': '\xab',
+        'laquo;': '\xab',
+        'Larr;': '\u219e',
+        'lArr;': '\u21d0',
+        'larr;': '\u2190',
+        'larrb;': '\u21e4',
+        'larrbfs;': '\u291f',
+        'larrfs;': '\u291d',
+        'larrhk;': '\u21a9',
+        'larrlp;': '\u21ab',
+        'larrpl;': '\u2939',
+        'larrsim;': '\u2973',
+        'larrtl;': '\u21a2',
+        'lat;': '\u2aab',
+        'lAtail;': '\u291b',
+        'latail;': '\u2919',
+        'late;': '\u2aad',
+        'lates;': '\u2aad\ufe00',
+        'lBarr;': '\u290e',
+        'lbarr;': '\u290c',
+        'lbbrk;': '\u2772',
+        'lbrace;': '{',
+        'lbrack;': '[',
+        'lbrke;': '\u298b',
+        'lbrksld;': '\u298f',
+        'lbrkslu;': '\u298d',
+        'Lcaron;': '\u013d',
+        'lcaron;': '\u013e',
+        'Lcedil;': '\u013b',
+        'lcedil;': '\u013c',
+        'lceil;': '\u2308',
+        'lcub;': '{',
+        'Lcy;': '\u041b',
+        'lcy;': '\u043b',
+        'ldca;': '\u2936',
+        'ldquo;': '\u201c',
+        'ldquor;': '\u201e',
+        'ldrdhar;': '\u2967',
+        'ldrushar;': '\u294b',
+        'ldsh;': '\u21b2',
+        'lE;': '\u2266',
+        'le;': '\u2264',
+        'LeftAngleBracket;': '\u27e8',
+        'LeftArrow;': '\u2190',
+        'Leftarrow;': '\u21d0',
+        'leftarrow;': '\u2190',
+        'LeftArrowBar;': '\u21e4',
+        'LeftArrowRightArrow;': '\u21c6',
+        'leftarrowtail;': '\u21a2',
+        'LeftCeiling;': '\u2308',
+        'LeftDoubleBracket;': '\u27e6',
+        'LeftDownTeeVector;': '\u2961',
+        'LeftDownVector;': '\u21c3',
+        'LeftDownVectorBar;': '\u2959',
+        'LeftFloor;': '\u230a',
+        'leftharpoondown;': '\u21bd',
+        'leftharpoonup;': '\u21bc',
+        'leftleftarrows;': '\u21c7',
+        'LeftRightArrow;': '\u2194',
+        'Leftrightarrow;': '\u21d4',
+        'leftrightarrow;': '\u2194',
+        'leftrightarrows;': '\u21c6',
+        'leftrightharpoons;': '\u21cb',
+        'leftrightsquigarrow;': '\u21ad',
+        'LeftRightVector;': '\u294e',
+        'LeftTee;': '\u22a3',
+        'LeftTeeArrow;': '\u21a4',
+        'LeftTeeVector;': '\u295a',
+        'leftthreetimes;': '\u22cb',
+        'LeftTriangle;': '\u22b2',
+        'LeftTriangleBar;': '\u29cf',
+        'LeftTriangleEqual;': '\u22b4',
+        'LeftUpDownVector;': '\u2951',
+        'LeftUpTeeVector;': '\u2960',
+        'LeftUpVector;': '\u21bf',
+        'LeftUpVectorBar;': '\u2958',
+        'LeftVector;': '\u21bc',
+        'LeftVectorBar;': '\u2952',
+        'lEg;': '\u2a8b',
+        'leg;': '\u22da',
+        'leq;': '\u2264',
+        'leqq;': '\u2266',
+        'leqslant;': '\u2a7d',
+        'les;': '\u2a7d',
+        'lescc;': '\u2aa8',
+        'lesdot;': '\u2a7f',
+        'lesdoto;': '\u2a81',
+        'lesdotor;': '\u2a83',
+        'lesg;': '\u22da\ufe00',
+        'lesges;': '\u2a93',
+        'lessapprox;': '\u2a85',
+        'lessdot;': '\u22d6',
+        'lesseqgtr;': '\u22da',
+        'lesseqqgtr;': '\u2a8b',
+        'LessEqualGreater;': '\u22da',
+        'LessFullEqual;': '\u2266',
+        'LessGreater;': '\u2276',
+        'lessgtr;': '\u2276',
+        'LessLess;': '\u2aa1',
+        'lesssim;': '\u2272',
+        'LessSlantEqual;': '\u2a7d',
+        'LessTilde;': '\u2272',
+        'lfisht;': '\u297c',
+        'lfloor;': '\u230a',
+        'Lfr;': '\U0001d50f',
+        'lfr;': '\U0001d529',
+        'lg;': '\u2276',
+        'lgE;': '\u2a91',
+        'lHar;': '\u2962',
+        'lhard;': '\u21bd',
+        'lharu;': '\u21bc',
+        'lharul;': '\u296a',
+        'lhblk;': '\u2584',
+        'LJcy;': '\u0409',
+        'ljcy;': '\u0459',
+        'Ll;': '\u22d8',
+        'll;': '\u226a',
+        'llarr;': '\u21c7',
+        'llcorner;': '\u231e',
+        'Lleftarrow;': '\u21da',
+        'llhard;': '\u296b',
+        'lltri;': '\u25fa',
+        'Lmidot;': '\u013f',
+        'lmidot;': '\u0140',
+        'lmoust;': '\u23b0',
+        'lmoustache;': '\u23b0',
+        'lnap;': '\u2a89',
+        'lnapprox;': '\u2a89',
+        'lnE;': '\u2268',
+        'lne;': '\u2a87',
+        'lneq;': '\u2a87',
+        'lneqq;': '\u2268',
+        'lnsim;': '\u22e6',
+        'loang;': '\u27ec',
+        'loarr;': '\u21fd',
+        'lobrk;': '\u27e6',
+        'LongLeftArrow;': '\u27f5',
+        'Longleftarrow;': '\u27f8',
+        'longleftarrow;': '\u27f5',
+        'LongLeftRightArrow;': '\u27f7',
+        'Longleftrightarrow;': '\u27fa',
+        'longleftrightarrow;': '\u27f7',
+        'longmapsto;': '\u27fc',
+        'LongRightArrow;': '\u27f6',
+        'Longrightarrow;': '\u27f9',
+        'longrightarrow;': '\u27f6',
+        'looparrowleft;': '\u21ab',
+        'looparrowright;': '\u21ac',
+        'lopar;': '\u2985',
+        'Lopf;': '\U0001d543',
+        'lopf;': '\U0001d55d',
+        'loplus;': '\u2a2d',
+        'lotimes;': '\u2a34',
+        'lowast;': '\u2217',
+        'lowbar;': '_',
+        'LowerLeftArrow;': '\u2199',
+        'LowerRightArrow;': '\u2198',
+        'loz;': '\u25ca',
+        'lozenge;': '\u25ca',
+        'lozf;': '\u29eb',
+        'lpar;': '(',
+        'lparlt;': '\u2993',
+        'lrarr;': '\u21c6',
+        'lrcorner;': '\u231f',
+        'lrhar;': '\u21cb',
+        'lrhard;': '\u296d',
+        'lrm;': '\u200e',
+        'lrtri;': '\u22bf',
+        'lsaquo;': '\u2039',
+        'Lscr;': '\u2112',
+        'lscr;': '\U0001d4c1',
+        'Lsh;': '\u21b0',
+        'lsh;': '\u21b0',
+        'lsim;': '\u2272',
+        'lsime;': '\u2a8d',
+        'lsimg;': '\u2a8f',
+        'lsqb;': '[',
+        'lsquo;': '\u2018',
+        'lsquor;': '\u201a',
+        'Lstrok;': '\u0141',
+        'lstrok;': '\u0142',
+        'LT': '<',
+        'lt': '<',
+        'LT;': '<',
+        'Lt;': '\u226a',
+        'lt;': '<',
+        'ltcc;': '\u2aa6',
+        'ltcir;': '\u2a79',
+        'ltdot;': '\u22d6',
+        'lthree;': '\u22cb',
+        'ltimes;': '\u22c9',
+        'ltlarr;': '\u2976',
+        'ltquest;': '\u2a7b',
+        'ltri;': '\u25c3',
+        'ltrie;': '\u22b4',
+        'ltrif;': '\u25c2',
+        'ltrPar;': '\u2996',
+        'lurdshar;': '\u294a',
+        'luruhar;': '\u2966',
+        'lvertneqq;': '\u2268\ufe00',
+        'lvnE;': '\u2268\ufe00',
+        'macr': '\xaf',
+        'macr;': '\xaf',
+        'male;': '\u2642',
+        'malt;': '\u2720',
+        'maltese;': '\u2720',
+        'Map;': '\u2905',
+        'map;': '\u21a6',
+        'mapsto;': '\u21a6',
+        'mapstodown;': '\u21a7',
+        'mapstoleft;': '\u21a4',
+        'mapstoup;': '\u21a5',
+        'marker;': '\u25ae',
+        'mcomma;': '\u2a29',
+        'Mcy;': '\u041c',
+        'mcy;': '\u043c',
+        'mdash;': '\u2014',
+        'mDDot;': '\u223a',
+        'measuredangle;': '\u2221',
+        'MediumSpace;': '\u205f',
+        'Mellintrf;': '\u2133',
+        'Mfr;': '\U0001d510',
+        'mfr;': '\U0001d52a',
+        'mho;': '\u2127',
+        'micro': '\xb5',
+        'micro;': '\xb5',
+        'mid;': '\u2223',
+        'midast;': '*',
+        'midcir;': '\u2af0',
+        'middot': '\xb7',
+        'middot;': '\xb7',
+        'minus;': '\u2212',
+        'minusb;': '\u229f',
+        'minusd;': '\u2238',
+        'minusdu;': '\u2a2a',
+        'MinusPlus;': '\u2213',
+        'mlcp;': '\u2adb',
+        'mldr;': '\u2026',
+        'mnplus;': '\u2213',
+        'models;': '\u22a7',
+        'Mopf;': '\U0001d544',
+        'mopf;': '\U0001d55e',
+        'mp;': '\u2213',
+        'Mscr;': '\u2133',
+        'mscr;': '\U0001d4c2',
+        'mstpos;': '\u223e',
+        'Mu;': '\u039c',
+        'mu;': '\u03bc',
+        'multimap;': '\u22b8',
+        'mumap;': '\u22b8',
+        'nabla;': '\u2207',
+        'Nacute;': '\u0143',
+        'nacute;': '\u0144',
+        'nang;': '\u2220\u20d2',
+        'nap;': '\u2249',
+        'napE;': '\u2a70\u0338',
+        'napid;': '\u224b\u0338',
+        'napos;': '\u0149',
+        'napprox;': '\u2249',
+        'natur;': '\u266e',
+        'natural;': '\u266e',
+        'naturals;': '\u2115',
+        'nbsp': '\xa0',
+        'nbsp;': '\xa0',
+        'nbump;': '\u224e\u0338',
+        'nbumpe;': '\u224f\u0338',
+        'ncap;': '\u2a43',
+        'Ncaron;': '\u0147',
+        'ncaron;': '\u0148',
+        'Ncedil;': '\u0145',
+        'ncedil;': '\u0146',
+        'ncong;': '\u2247',
+        'ncongdot;': '\u2a6d\u0338',
+        'ncup;': '\u2a42',
+        'Ncy;': '\u041d',
+        'ncy;': '\u043d',
+        'ndash;': '\u2013',
+        'ne;': '\u2260',
+        'nearhk;': '\u2924',
+        'neArr;': '\u21d7',
+        'nearr;': '\u2197',
+        'nearrow;': '\u2197',
+        'nedot;': '\u2250\u0338',
+        'NegativeMediumSpace;': '\u200b',
+        'NegativeThickSpace;': '\u200b',
+        'NegativeThinSpace;': '\u200b',
+        'NegativeVeryThinSpace;': '\u200b',
+        'nequiv;': '\u2262',
+        'nesear;': '\u2928',
+        'nesim;': '\u2242\u0338',
+        'NestedGreaterGreater;': '\u226b',
+        'NestedLessLess;': '\u226a',
+        'NewLine;': '\n',
+        'nexist;': '\u2204',
+        'nexists;': '\u2204',
+        'Nfr;': '\U0001d511',
+        'nfr;': '\U0001d52b',
+        'ngE;': '\u2267\u0338',
+        'nge;': '\u2271',
+        'ngeq;': '\u2271',
+        'ngeqq;': '\u2267\u0338',
+        'ngeqslant;': '\u2a7e\u0338',
+        'nges;': '\u2a7e\u0338',
+        'nGg;': '\u22d9\u0338',
+        'ngsim;': '\u2275',
+        'nGt;': '\u226b\u20d2',
+        'ngt;': '\u226f',
+        'ngtr;': '\u226f',
+        'nGtv;': '\u226b\u0338',
+        'nhArr;': '\u21ce',
+        'nharr;': '\u21ae',
+        'nhpar;': '\u2af2',
+        'ni;': '\u220b',
+        'nis;': '\u22fc',
+        'nisd;': '\u22fa',
+        'niv;': '\u220b',
+        'NJcy;': '\u040a',
+        'njcy;': '\u045a',
+        'nlArr;': '\u21cd',
+        'nlarr;': '\u219a',
+        'nldr;': '\u2025',
+        'nlE;': '\u2266\u0338',
+        'nle;': '\u2270',
+        'nLeftarrow;': '\u21cd',
+        'nleftarrow;': '\u219a',
+        'nLeftrightarrow;': '\u21ce',
+        'nleftrightarrow;': '\u21ae',
+        'nleq;': '\u2270',
+        'nleqq;': '\u2266\u0338',
+        'nleqslant;': '\u2a7d\u0338',
+        'nles;': '\u2a7d\u0338',
+        'nless;': '\u226e',
+        'nLl;': '\u22d8\u0338',
+        'nlsim;': '\u2274',
+        'nLt;': '\u226a\u20d2',
+        'nlt;': '\u226e',
+        'nltri;': '\u22ea',
+        'nltrie;': '\u22ec',
+        'nLtv;': '\u226a\u0338',
+        'nmid;': '\u2224',
+        'NoBreak;': '\u2060',
+        'NonBreakingSpace;': '\xa0',
+        'Nopf;': '\u2115',
+        'nopf;': '\U0001d55f',
+        'not': '\xac',
+        'Not;': '\u2aec',
+        'not;': '\xac',
+        'NotCongruent;': '\u2262',
+        'NotCupCap;': '\u226d',
+        'NotDoubleVerticalBar;': '\u2226',
+        'NotElement;': '\u2209',
+        'NotEqual;': '\u2260',
+        'NotEqualTilde;': '\u2242\u0338',
+        'NotExists;': '\u2204',
+        'NotGreater;': '\u226f',
+        'NotGreaterEqual;': '\u2271',
+        'NotGreaterFullEqual;': '\u2267\u0338',
+        'NotGreaterGreater;': '\u226b\u0338',
+        'NotGreaterLess;': '\u2279',
+        'NotGreaterSlantEqual;': '\u2a7e\u0338',
+        'NotGreaterTilde;': '\u2275',
+        'NotHumpDownHump;': '\u224e\u0338',
+        'NotHumpEqual;': '\u224f\u0338',
+        'notin;': '\u2209',
+        'notindot;': '\u22f5\u0338',
+        'notinE;': '\u22f9\u0338',
+        'notinva;': '\u2209',
+        'notinvb;': '\u22f7',
+        'notinvc;': '\u22f6',
+        'NotLeftTriangle;': '\u22ea',
+        'NotLeftTriangleBar;': '\u29cf\u0338',
+        'NotLeftTriangleEqual;': '\u22ec',
+        'NotLess;': '\u226e',
+        'NotLessEqual;': '\u2270',
+        'NotLessGreater;': '\u2278',
+        'NotLessLess;': '\u226a\u0338',
+        'NotLessSlantEqual;': '\u2a7d\u0338',
+        'NotLessTilde;': '\u2274',
+        'NotNestedGreaterGreater;': '\u2aa2\u0338',
+        'NotNestedLessLess;': '\u2aa1\u0338',
+        'notni;': '\u220c',
+        'notniva;': '\u220c',
+        'notnivb;': '\u22fe',
+        'notnivc;': '\u22fd',
+        'NotPrecedes;': '\u2280',
+        'NotPrecedesEqual;': '\u2aaf\u0338',
+        'NotPrecedesSlantEqual;': '\u22e0',
+        'NotReverseElement;': '\u220c',
+        'NotRightTriangle;': '\u22eb',
+        'NotRightTriangleBar;': '\u29d0\u0338',
+        'NotRightTriangleEqual;': '\u22ed',
+        'NotSquareSubset;': '\u228f\u0338',
+        'NotSquareSubsetEqual;': '\u22e2',
+        'NotSquareSuperset;': '\u2290\u0338',
+        'NotSquareSupersetEqual;': '\u22e3',
+        'NotSubset;': '\u2282\u20d2',
+        'NotSubsetEqual;': '\u2288',
+        'NotSucceeds;': '\u2281',
+        'NotSucceedsEqual;': '\u2ab0\u0338',
+        'NotSucceedsSlantEqual;': '\u22e1',
+        'NotSucceedsTilde;': '\u227f\u0338',
+        'NotSuperset;': '\u2283\u20d2',
+        'NotSupersetEqual;': '\u2289',
+        'NotTilde;': '\u2241',
+        'NotTildeEqual;': '\u2244',
+        'NotTildeFullEqual;': '\u2247',
+        'NotTildeTilde;': '\u2249',
+        'NotVerticalBar;': '\u2224',
+        'npar;': '\u2226',
+        'nparallel;': '\u2226',
+        'nparsl;': '\u2afd\u20e5',
+        'npart;': '\u2202\u0338',
+        'npolint;': '\u2a14',
+        'npr;': '\u2280',
+        'nprcue;': '\u22e0',
+        'npre;': '\u2aaf\u0338',
+        'nprec;': '\u2280',
+        'npreceq;': '\u2aaf\u0338',
+        'nrArr;': '\u21cf',
+        'nrarr;': '\u219b',
+        'nrarrc;': '\u2933\u0338',
+        'nrarrw;': '\u219d\u0338',
+        'nRightarrow;': '\u21cf',
+        'nrightarrow;': '\u219b',
+        'nrtri;': '\u22eb',
+        'nrtrie;': '\u22ed',
+        'nsc;': '\u2281',
+        'nsccue;': '\u22e1',
+        'nsce;': '\u2ab0\u0338',
+        'Nscr;': '\U0001d4a9',
+        'nscr;': '\U0001d4c3',
+        'nshortmid;': '\u2224',
+        'nshortparallel;': '\u2226',
+        'nsim;': '\u2241',
+        'nsime;': '\u2244',
+        'nsimeq;': '\u2244',
+        'nsmid;': '\u2224',
+        'nspar;': '\u2226',
+        'nsqsube;': '\u22e2',
+        'nsqsupe;': '\u22e3',
+        'nsub;': '\u2284',
+        'nsubE;': '\u2ac5\u0338',
+        'nsube;': '\u2288',
+        'nsubset;': '\u2282\u20d2',
+        'nsubseteq;': '\u2288',
+        'nsubseteqq;': '\u2ac5\u0338',
+        'nsucc;': '\u2281',
+        'nsucceq;': '\u2ab0\u0338',
+        'nsup;': '\u2285',
+        'nsupE;': '\u2ac6\u0338',
+        'nsupe;': '\u2289',
+        'nsupset;': '\u2283\u20d2',
+        'nsupseteq;': '\u2289',
+        'nsupseteqq;': '\u2ac6\u0338',
+        'ntgl;': '\u2279',
+        'Ntilde': '\xd1',
+        'ntilde': '\xf1',
+        'Ntilde;': '\xd1',
+        'ntilde;': '\xf1',
+        'ntlg;': '\u2278',
+        'ntriangleleft;': '\u22ea',
+        'ntrianglelefteq;': '\u22ec',
+        'ntriangleright;': '\u22eb',
+        'ntrianglerighteq;': '\u22ed',
+        'Nu;': '\u039d',
+        'nu;': '\u03bd',
+        'num;': '#',
+        'numero;': '\u2116',
+        'numsp;': '\u2007',
+        'nvap;': '\u224d\u20d2',
+        'nVDash;': '\u22af',
+        'nVdash;': '\u22ae',
+        'nvDash;': '\u22ad',
+        'nvdash;': '\u22ac',
+        'nvge;': '\u2265\u20d2',
+        'nvgt;': '>\u20d2',
+        'nvHarr;': '\u2904',
+        'nvinfin;': '\u29de',
+        'nvlArr;': '\u2902',
+        'nvle;': '\u2264\u20d2',
+        'nvlt;': '<\u20d2',
+        'nvltrie;': '\u22b4\u20d2',
+        'nvrArr;': '\u2903',
+        'nvrtrie;': '\u22b5\u20d2',
+        'nvsim;': '\u223c\u20d2',
+        'nwarhk;': '\u2923',
+        'nwArr;': '\u21d6',
+        'nwarr;': '\u2196',
+        'nwarrow;': '\u2196',
+        'nwnear;': '\u2927',
+        'Oacute': '\xd3',
+        'oacute': '\xf3',
+        'Oacute;': '\xd3',
+        'oacute;': '\xf3',
+        'oast;': '\u229b',
+        'ocir;': '\u229a',
+        'Ocirc': '\xd4',
+        'ocirc': '\xf4',
+        'Ocirc;': '\xd4',
+        'ocirc;': '\xf4',
+        'Ocy;': '\u041e',
+        'ocy;': '\u043e',
+        'odash;': '\u229d',
+        'Odblac;': '\u0150',
+        'odblac;': '\u0151',
+        'odiv;': '\u2a38',
+        'odot;': '\u2299',
+        'odsold;': '\u29bc',
+        'OElig;': '\u0152',
+        'oelig;': '\u0153',
+        'ofcir;': '\u29bf',
+        'Ofr;': '\U0001d512',
+        'ofr;': '\U0001d52c',
+        'ogon;': '\u02db',
+        'Ograve': '\xd2',
+        'ograve': '\xf2',
+        'Ograve;': '\xd2',
+        'ograve;': '\xf2',
+        'ogt;': '\u29c1',
+        'ohbar;': '\u29b5',
+        'ohm;': '\u03a9',
+        'oint;': '\u222e',
+        'olarr;': '\u21ba',
+        'olcir;': '\u29be',
+        'olcross;': '\u29bb',
+        'oline;': '\u203e',
+        'olt;': '\u29c0',
+        'Omacr;': '\u014c',
+        'omacr;': '\u014d',
+        'Omega;': '\u03a9',
+        'omega;': '\u03c9',
+        'Omicron;': '\u039f',
+        'omicron;': '\u03bf',
+        'omid;': '\u29b6',
+        'ominus;': '\u2296',
+        'Oopf;': '\U0001d546',
+        'oopf;': '\U0001d560',
+        'opar;': '\u29b7',
+        'OpenCurlyDoubleQuote;': '\u201c',
+        'OpenCurlyQuote;': '\u2018',
+        'operp;': '\u29b9',
+        'oplus;': '\u2295',
+        'Or;': '\u2a54',
+        'or;': '\u2228',
+        'orarr;': '\u21bb',
+        'ord;': '\u2a5d',
+        'order;': '\u2134',
+        'orderof;': '\u2134',
+        'ordf': '\xaa',
+        'ordf;': '\xaa',
+        'ordm': '\xba',
+        'ordm;': '\xba',
+        'origof;': '\u22b6',
+        'oror;': '\u2a56',
+        'orslope;': '\u2a57',
+        'orv;': '\u2a5b',
+        'oS;': '\u24c8',
+        'Oscr;': '\U0001d4aa',
+        'oscr;': '\u2134',
+        'Oslash': '\xd8',
+        'oslash': '\xf8',
+        'Oslash;': '\xd8',
+        'oslash;': '\xf8',
+        'osol;': '\u2298',
+        'Otilde': '\xd5',
+        'otilde': '\xf5',
+        'Otilde;': '\xd5',
+        'otilde;': '\xf5',
+        'Otimes;': '\u2a37',
+        'otimes;': '\u2297',
+        'otimesas;': '\u2a36',
+        'Ouml': '\xd6',
+        'ouml': '\xf6',
+        'Ouml;': '\xd6',
+        'ouml;': '\xf6',
+        'ovbar;': '\u233d',
+        'OverBar;': '\u203e',
+        'OverBrace;': '\u23de',
+        'OverBracket;': '\u23b4',
+        'OverParenthesis;': '\u23dc',
+        'par;': '\u2225',
+        'para': '\xb6',
+        'para;': '\xb6',
+        'parallel;': '\u2225',
+        'parsim;': '\u2af3',
+        'parsl;': '\u2afd',
+        'part;': '\u2202',
+        'PartialD;': '\u2202',
+        'Pcy;': '\u041f',
+        'pcy;': '\u043f',
+        'percnt;': '%',
+        'period;': '.',
+        'permil;': '\u2030',
+        'perp;': '\u22a5',
+        'pertenk;': '\u2031',
+        'Pfr;': '\U0001d513',
+        'pfr;': '\U0001d52d',
+        'Phi;': '\u03a6',
+        'phi;': '\u03c6',
+        'phiv;': '\u03d5',
+        'phmmat;': '\u2133',
+        'phone;': '\u260e',
+        'Pi;': '\u03a0',
+        'pi;': '\u03c0',
+        'pitchfork;': '\u22d4',
+        'piv;': '\u03d6',
+        'planck;': '\u210f',
+        'planckh;': '\u210e',
+        'plankv;': '\u210f',
+        'plus;': '+',
+        'plusacir;': '\u2a23',
+        'plusb;': '\u229e',
+        'pluscir;': '\u2a22',
+        'plusdo;': '\u2214',
+        'plusdu;': '\u2a25',
+        'pluse;': '\u2a72',
+        'PlusMinus;': '\xb1',
+        'plusmn': '\xb1',
+        'plusmn;': '\xb1',
+        'plussim;': '\u2a26',
+        'plustwo;': '\u2a27',
+        'pm;': '\xb1',
+        'Poincareplane;': '\u210c',
+        'pointint;': '\u2a15',
+        'Popf;': '\u2119',
+        'popf;': '\U0001d561',
+        'pound': '\xa3',
+        'pound;': '\xa3',
+        'Pr;': '\u2abb',
+        'pr;': '\u227a',
+        'prap;': '\u2ab7',
+        'prcue;': '\u227c',
+        'prE;': '\u2ab3',
+        'pre;': '\u2aaf',
+        'prec;': '\u227a',
+        'precapprox;': '\u2ab7',
+        'preccurlyeq;': '\u227c',
+        'Precedes;': '\u227a',
+        'PrecedesEqual;': '\u2aaf',
+        'PrecedesSlantEqual;': '\u227c',
+        'PrecedesTilde;': '\u227e',
+        'preceq;': '\u2aaf',
+        'precnapprox;': '\u2ab9',
+        'precneqq;': '\u2ab5',
+        'precnsim;': '\u22e8',
+        'precsim;': '\u227e',
+        'Prime;': '\u2033',
+        'prime;': '\u2032',
+        'primes;': '\u2119',
+        'prnap;': '\u2ab9',
+        'prnE;': '\u2ab5',
+        'prnsim;': '\u22e8',
+        'prod;': '\u220f',
+        'Product;': '\u220f',
+        'profalar;': '\u232e',
+        'profline;': '\u2312',
+        'profsurf;': '\u2313',
+        'prop;': '\u221d',
+        'Proportion;': '\u2237',
+        'Proportional;': '\u221d',
+        'propto;': '\u221d',
+        'prsim;': '\u227e',
+        'prurel;': '\u22b0',
+        'Pscr;': '\U0001d4ab',
+        'pscr;': '\U0001d4c5',
+        'Psi;': '\u03a8',
+        'psi;': '\u03c8',
+        'puncsp;': '\u2008',
+        'Qfr;': '\U0001d514',
+        'qfr;': '\U0001d52e',
+        'qint;': '\u2a0c',
+        'Qopf;': '\u211a',
+        'qopf;': '\U0001d562',
+        'qprime;': '\u2057',
+        'Qscr;': '\U0001d4ac',
+        'qscr;': '\U0001d4c6',
+        'quaternions;': '\u210d',
+        'quatint;': '\u2a16',
+        'quest;': '?',
+        'questeq;': '\u225f',
+        'QUOT': '"',
+        'quot': '"',
+        'QUOT;': '"',
+        'quot;': '"',
+        'rAarr;': '\u21db',
+        'race;': '\u223d\u0331',
+        'Racute;': '\u0154',
+        'racute;': '\u0155',
+        'radic;': '\u221a',
+        'raemptyv;': '\u29b3',
+        'Rang;': '\u27eb',
+        'rang;': '\u27e9',
+        'rangd;': '\u2992',
+        'range;': '\u29a5',
+        'rangle;': '\u27e9',
+        'raquo': '\xbb',
+        'raquo;': '\xbb',
+        'Rarr;': '\u21a0',
+        'rArr;': '\u21d2',
+        'rarr;': '\u2192',
+        'rarrap;': '\u2975',
+        'rarrb;': '\u21e5',
+        'rarrbfs;': '\u2920',
+        'rarrc;': '\u2933',
+        'rarrfs;': '\u291e',
+        'rarrhk;': '\u21aa',
+        'rarrlp;': '\u21ac',
+        'rarrpl;': '\u2945',
+        'rarrsim;': '\u2974',
+        'Rarrtl;': '\u2916',
+        'rarrtl;': '\u21a3',
+        'rarrw;': '\u219d',
+        'rAtail;': '\u291c',
+        'ratail;': '\u291a',
+        'ratio;': '\u2236',
+        'rationals;': '\u211a',
+        'RBarr;': '\u2910',
+        'rBarr;': '\u290f',
+        'rbarr;': '\u290d',
+        'rbbrk;': '\u2773',
+        'rbrace;': '}',
+        'rbrack;': ']',
+        'rbrke;': '\u298c',
+        'rbrksld;': '\u298e',
+        'rbrkslu;': '\u2990',
+        'Rcaron;': '\u0158',
+        'rcaron;': '\u0159',
+        'Rcedil;': '\u0156',
+        'rcedil;': '\u0157',
+        'rceil;': '\u2309',
+        'rcub;': '}',
+        'Rcy;': '\u0420',
+        'rcy;': '\u0440',
+        'rdca;': '\u2937',
+        'rdldhar;': '\u2969',
+        'rdquo;': '\u201d',
+        'rdquor;': '\u201d',
+        'rdsh;': '\u21b3',
+        'Re;': '\u211c',
+        'real;': '\u211c',
+        'realine;': '\u211b',
+        'realpart;': '\u211c',
+        'reals;': '\u211d',
+        'rect;': '\u25ad',
+        'REG': '\xae',
+        'reg': '\xae',
+        'REG;': '\xae',
+        'reg;': '\xae',
+        'ReverseElement;': '\u220b',
+        'ReverseEquilibrium;': '\u21cb',
+        'ReverseUpEquilibrium;': '\u296f',
+        'rfisht;': '\u297d',
+        'rfloor;': '\u230b',
+        'Rfr;': '\u211c',
+        'rfr;': '\U0001d52f',
+        'rHar;': '\u2964',
+        'rhard;': '\u21c1',
+        'rharu;': '\u21c0',
+        'rharul;': '\u296c',
+        'Rho;': '\u03a1',
+        'rho;': '\u03c1',
+        'rhov;': '\u03f1',
+        'RightAngleBracket;': '\u27e9',
+        'RightArrow;': '\u2192',
+        'Rightarrow;': '\u21d2',
+        'rightarrow;': '\u2192',
+        'RightArrowBar;': '\u21e5',
+        'RightArrowLeftArrow;': '\u21c4',
+        'rightarrowtail;': '\u21a3',
+        'RightCeiling;': '\u2309',
+        'RightDoubleBracket;': '\u27e7',
+        'RightDownTeeVector;': '\u295d',
+        'RightDownVector;': '\u21c2',
+        'RightDownVectorBar;': '\u2955',
+        'RightFloor;': '\u230b',
+        'rightharpoondown;': '\u21c1',
+        'rightharpoonup;': '\u21c0',
+        'rightleftarrows;': '\u21c4',
+        'rightleftharpoons;': '\u21cc',
+        'rightrightarrows;': '\u21c9',
+        'rightsquigarrow;': '\u219d',
+        'RightTee;': '\u22a2',
+        'RightTeeArrow;': '\u21a6',
+        'RightTeeVector;': '\u295b',
+        'rightthreetimes;': '\u22cc',
+        'RightTriangle;': '\u22b3',
+        'RightTriangleBar;': '\u29d0',
+        'RightTriangleEqual;': '\u22b5',
+        'RightUpDownVector;': '\u294f',
+        'RightUpTeeVector;': '\u295c',
+        'RightUpVector;': '\u21be',
+        'RightUpVectorBar;': '\u2954',
+        'RightVector;': '\u21c0',
+        'RightVectorBar;': '\u2953',
+        'ring;': '\u02da',
+        'risingdotseq;': '\u2253',
+        'rlarr;': '\u21c4',
+        'rlhar;': '\u21cc',
+        'rlm;': '\u200f',
+        'rmoust;': '\u23b1',
+        'rmoustache;': '\u23b1',
+        'rnmid;': '\u2aee',
+        'roang;': '\u27ed',
+        'roarr;': '\u21fe',
+        'robrk;': '\u27e7',
+        'ropar;': '\u2986',
+        'Ropf;': '\u211d',
+        'ropf;': '\U0001d563',
+        'roplus;': '\u2a2e',
+        'rotimes;': '\u2a35',
+        'RoundImplies;': '\u2970',
+        'rpar;': ')',
+        'rpargt;': '\u2994',
+        'rppolint;': '\u2a12',
+        'rrarr;': '\u21c9',
+        'Rrightarrow;': '\u21db',
+        'rsaquo;': '\u203a',
+        'Rscr;': '\u211b',
+        'rscr;': '\U0001d4c7',
+        'Rsh;': '\u21b1',
+        'rsh;': '\u21b1',
+        'rsqb;': ']',
+        'rsquo;': '\u2019',
+        'rsquor;': '\u2019',
+        'rthree;': '\u22cc',
+        'rtimes;': '\u22ca',
+        'rtri;': '\u25b9',
+        'rtrie;': '\u22b5',
+        'rtrif;': '\u25b8',
+        'rtriltri;': '\u29ce',
+        'RuleDelayed;': '\u29f4',
+        'ruluhar;': '\u2968',
+        'rx;': '\u211e',
+        'Sacute;': '\u015a',
+        'sacute;': '\u015b',
+        'sbquo;': '\u201a',
+        'Sc;': '\u2abc',
+        'sc;': '\u227b',
+        'scap;': '\u2ab8',
+        'Scaron;': '\u0160',
+        'scaron;': '\u0161',
+        'sccue;': '\u227d',
+        'scE;': '\u2ab4',
+        'sce;': '\u2ab0',
+        'Scedil;': '\u015e',
+        'scedil;': '\u015f',
+        'Scirc;': '\u015c',
+        'scirc;': '\u015d',
+        'scnap;': '\u2aba',
+        'scnE;': '\u2ab6',
+        'scnsim;': '\u22e9',
+        'scpolint;': '\u2a13',
+        'scsim;': '\u227f',
+        'Scy;': '\u0421',
+        'scy;': '\u0441',
+        'sdot;': '\u22c5',
+        'sdotb;': '\u22a1',
+        'sdote;': '\u2a66',
+        'searhk;': '\u2925',
+        'seArr;': '\u21d8',
+        'searr;': '\u2198',
+        'searrow;': '\u2198',
+        'sect': '\xa7',
+        'sect;': '\xa7',
+        'semi;': ';',
+        'seswar;': '\u2929',
+        'setminus;': '\u2216',
+        'setmn;': '\u2216',
+        'sext;': '\u2736',
+        'Sfr;': '\U0001d516',
+        'sfr;': '\U0001d530',
+        'sfrown;': '\u2322',
+        'sharp;': '\u266f',
+        'SHCHcy;': '\u0429',
+        'shchcy;': '\u0449',
+        'SHcy;': '\u0428',
+        'shcy;': '\u0448',
+        'ShortDownArrow;': '\u2193',
+        'ShortLeftArrow;': '\u2190',
+        'shortmid;': '\u2223',
+        'shortparallel;': '\u2225',
+        'ShortRightArrow;': '\u2192',
+        'ShortUpArrow;': '\u2191',
+        'shy': '\xad',
+        'shy;': '\xad',
+        'Sigma;': '\u03a3',
+        'sigma;': '\u03c3',
+        'sigmaf;': '\u03c2',
+        'sigmav;': '\u03c2',
+        'sim;': '\u223c',
+        'simdot;': '\u2a6a',
+        'sime;': '\u2243',
+        'simeq;': '\u2243',
+        'simg;': '\u2a9e',
+        'simgE;': '\u2aa0',
+        'siml;': '\u2a9d',
+        'simlE;': '\u2a9f',
+        'simne;': '\u2246',
+        'simplus;': '\u2a24',
+        'simrarr;': '\u2972',
+        'slarr;': '\u2190',
+        'SmallCircle;': '\u2218',
+        'smallsetminus;': '\u2216',
+        'smashp;': '\u2a33',
+        'smeparsl;': '\u29e4',
+        'smid;': '\u2223',
+        'smile;': '\u2323',
+        'smt;': '\u2aaa',
+        'smte;': '\u2aac',
+        'smtes;': '\u2aac\ufe00',
+        'SOFTcy;': '\u042c',
+        'softcy;': '\u044c',
+        'sol;': '/',
+        'solb;': '\u29c4',
+        'solbar;': '\u233f',
+        'Sopf;': '\U0001d54a',
+        'sopf;': '\U0001d564',
+        'spades;': '\u2660',
+        'spadesuit;': '\u2660',
+        'spar;': '\u2225',
+        'sqcap;': '\u2293',
+        'sqcaps;': '\u2293\ufe00',
+        'sqcup;': '\u2294',
+        'sqcups;': '\u2294\ufe00',
+        'Sqrt;': '\u221a',
+        'sqsub;': '\u228f',
+        'sqsube;': '\u2291',
+        'sqsubset;': '\u228f',
+        'sqsubseteq;': '\u2291',
+        'sqsup;': '\u2290',
+        'sqsupe;': '\u2292',
+        'sqsupset;': '\u2290',
+        'sqsupseteq;': '\u2292',
+        'squ;': '\u25a1',
+        'Square;': '\u25a1',
+        'square;': '\u25a1',
+        'SquareIntersection;': '\u2293',
+        'SquareSubset;': '\u228f',
+        'SquareSubsetEqual;': '\u2291',
+        'SquareSuperset;': '\u2290',
+        'SquareSupersetEqual;': '\u2292',
+        'SquareUnion;': '\u2294',
+        'squarf;': '\u25aa',
+        'squf;': '\u25aa',
+        'srarr;': '\u2192',
+        'Sscr;': '\U0001d4ae',
+        'sscr;': '\U0001d4c8',
+        'ssetmn;': '\u2216',
+        'ssmile;': '\u2323',
+        'sstarf;': '\u22c6',
+        'Star;': '\u22c6',
+        'star;': '\u2606',
+        'starf;': '\u2605',
+        'straightepsilon;': '\u03f5',
+        'straightphi;': '\u03d5',
+        'strns;': '\xaf',
+        'Sub;': '\u22d0',
+        'sub;': '\u2282',
+        'subdot;': '\u2abd',
+        'subE;': '\u2ac5',
+        'sube;': '\u2286',
+        'subedot;': '\u2ac3',
+        'submult;': '\u2ac1',
+        'subnE;': '\u2acb',
+        'subne;': '\u228a',
+        'subplus;': '\u2abf',
+        'subrarr;': '\u2979',
+        'Subset;': '\u22d0',
+        'subset;': '\u2282',
+        'subseteq;': '\u2286',
+        'subseteqq;': '\u2ac5',
+        'SubsetEqual;': '\u2286',
+        'subsetneq;': '\u228a',
+        'subsetneqq;': '\u2acb',
+        'subsim;': '\u2ac7',
+        'subsub;': '\u2ad5',
+        'subsup;': '\u2ad3',
+        'succ;': '\u227b',
+        'succapprox;': '\u2ab8',
+        'succcurlyeq;': '\u227d',
+        'Succeeds;': '\u227b',
+        'SucceedsEqual;': '\u2ab0',
+        'SucceedsSlantEqual;': '\u227d',
+        'SucceedsTilde;': '\u227f',
+        'succeq;': '\u2ab0',
+        'succnapprox;': '\u2aba',
+        'succneqq;': '\u2ab6',
+        'succnsim;': '\u22e9',
+        'succsim;': '\u227f',
+        'SuchThat;': '\u220b',
+        'Sum;': '\u2211',
+        'sum;': '\u2211',
+        'sung;': '\u266a',
+        'sup1': '\xb9',
+        'sup1;': '\xb9',
+        'sup2': '\xb2',
+        'sup2;': '\xb2',
+        'sup3': '\xb3',
+        'sup3;': '\xb3',
+        'Sup;': '\u22d1',
+        'sup;': '\u2283',
+        'supdot;': '\u2abe',
+        'supdsub;': '\u2ad8',
+        'supE;': '\u2ac6',
+        'supe;': '\u2287',
+        'supedot;': '\u2ac4',
+        'Superset;': '\u2283',
+        'SupersetEqual;': '\u2287',
+        'suphsol;': '\u27c9',
+        'suphsub;': '\u2ad7',
+        'suplarr;': '\u297b',
+        'supmult;': '\u2ac2',
+        'supnE;': '\u2acc',
+        'supne;': '\u228b',
+        'supplus;': '\u2ac0',
+        'Supset;': '\u22d1',
+        'supset;': '\u2283',
+        'supseteq;': '\u2287',
+        'supseteqq;': '\u2ac6',
+        'supsetneq;': '\u228b',
+        'supsetneqq;': '\u2acc',
+        'supsim;': '\u2ac8',
+        'supsub;': '\u2ad4',
+        'supsup;': '\u2ad6',
+        'swarhk;': '\u2926',
+        'swArr;': '\u21d9',
+        'swarr;': '\u2199',
+        'swarrow;': '\u2199',
+        'swnwar;': '\u292a',
+        'szlig': '\xdf',
+        'szlig;': '\xdf',
+        'Tab;': '\t',
+        'target;': '\u2316',
+        'Tau;': '\u03a4',
+        'tau;': '\u03c4',
+        'tbrk;': '\u23b4',
+        'Tcaron;': '\u0164',
+        'tcaron;': '\u0165',
+        'Tcedil;': '\u0162',
+        'tcedil;': '\u0163',
+        'Tcy;': '\u0422',
+        'tcy;': '\u0442',
+        'tdot;': '\u20db',
+        'telrec;': '\u2315',
+        'Tfr;': '\U0001d517',
+        'tfr;': '\U0001d531',
+        'there4;': '\u2234',
+        'Therefore;': '\u2234',
+        'therefore;': '\u2234',
+        'Theta;': '\u0398',
+        'theta;': '\u03b8',
+        'thetasym;': '\u03d1',
+        'thetav;': '\u03d1',
+        'thickapprox;': '\u2248',
+        'thicksim;': '\u223c',
+        'ThickSpace;': '\u205f\u200a',
+        'thinsp;': '\u2009',
+        'ThinSpace;': '\u2009',
+        'thkap;': '\u2248',
+        'thksim;': '\u223c',
+        'THORN': '\xde',
+        'thorn': '\xfe',
+        'THORN;': '\xde',
+        'thorn;': '\xfe',
+        'Tilde;': '\u223c',
+        'tilde;': '\u02dc',
+        'TildeEqual;': '\u2243',
+        'TildeFullEqual;': '\u2245',
+        'TildeTilde;': '\u2248',
+        'times': '\xd7',
+        'times;': '\xd7',
+        'timesb;': '\u22a0',
+        'timesbar;': '\u2a31',
+        'timesd;': '\u2a30',
+        'tint;': '\u222d',
+        'toea;': '\u2928',
+        'top;': '\u22a4',
+        'topbot;': '\u2336',
+        'topcir;': '\u2af1',
+        'Topf;': '\U0001d54b',
+        'topf;': '\U0001d565',
+        'topfork;': '\u2ada',
+        'tosa;': '\u2929',
+        'tprime;': '\u2034',
+        'TRADE;': '\u2122',
+        'trade;': '\u2122',
+        'triangle;': '\u25b5',
+        'triangledown;': '\u25bf',
+        'triangleleft;': '\u25c3',
+        'trianglelefteq;': '\u22b4',
+        'triangleq;': '\u225c',
+        'triangleright;': '\u25b9',
+        'trianglerighteq;': '\u22b5',
+        'tridot;': '\u25ec',
+        'trie;': '\u225c',
+        'triminus;': '\u2a3a',
+        'TripleDot;': '\u20db',
+        'triplus;': '\u2a39',
+        'trisb;': '\u29cd',
+        'tritime;': '\u2a3b',
+        'trpezium;': '\u23e2',
+        'Tscr;': '\U0001d4af',
+        'tscr;': '\U0001d4c9',
+        'TScy;': '\u0426',
+        'tscy;': '\u0446',
+        'TSHcy;': '\u040b',
+        'tshcy;': '\u045b',
+        'Tstrok;': '\u0166',
+        'tstrok;': '\u0167',
+        'twixt;': '\u226c',
+        'twoheadleftarrow;': '\u219e',
+        'twoheadrightarrow;': '\u21a0',
+        'Uacute': '\xda',
+        'uacute': '\xfa',
+        'Uacute;': '\xda',
+        'uacute;': '\xfa',
+        'Uarr;': '\u219f',
+        'uArr;': '\u21d1',
+        'uarr;': '\u2191',
+        'Uarrocir;': '\u2949',
+        'Ubrcy;': '\u040e',
+        'ubrcy;': '\u045e',
+        'Ubreve;': '\u016c',
+        'ubreve;': '\u016d',
+        'Ucirc': '\xdb',
+        'ucirc': '\xfb',
+        'Ucirc;': '\xdb',
+        'ucirc;': '\xfb',
+        'Ucy;': '\u0423',
+        'ucy;': '\u0443',
+        'udarr;': '\u21c5',
+        'Udblac;': '\u0170',
+        'udblac;': '\u0171',
+        'udhar;': '\u296e',
+        'ufisht;': '\u297e',
+        'Ufr;': '\U0001d518',
+        'ufr;': '\U0001d532',
+        'Ugrave': '\xd9',
+        'ugrave': '\xf9',
+        'Ugrave;': '\xd9',
+        'ugrave;': '\xf9',
+        'uHar;': '\u2963',
+        'uharl;': '\u21bf',
+        'uharr;': '\u21be',
+        'uhblk;': '\u2580',
+        'ulcorn;': '\u231c',
+        'ulcorner;': '\u231c',
+        'ulcrop;': '\u230f',
+        'ultri;': '\u25f8',
+        'Umacr;': '\u016a',
+        'umacr;': '\u016b',
+        'uml': '\xa8',
+        'uml;': '\xa8',
+        'UnderBar;': '_',
+        'UnderBrace;': '\u23df',
+        'UnderBracket;': '\u23b5',
+        'UnderParenthesis;': '\u23dd',
+        'Union;': '\u22c3',
+        'UnionPlus;': '\u228e',
+        'Uogon;': '\u0172',
+        'uogon;': '\u0173',
+        'Uopf;': '\U0001d54c',
+        'uopf;': '\U0001d566',
+        'UpArrow;': '\u2191',
+        'Uparrow;': '\u21d1',
+        'uparrow;': '\u2191',
+        'UpArrowBar;': '\u2912',
+        'UpArrowDownArrow;': '\u21c5',
+        'UpDownArrow;': '\u2195',
+        'Updownarrow;': '\u21d5',
+        'updownarrow;': '\u2195',
+        'UpEquilibrium;': '\u296e',
+        'upharpoonleft;': '\u21bf',
+        'upharpoonright;': '\u21be',
+        'uplus;': '\u228e',
+        'UpperLeftArrow;': '\u2196',
+        'UpperRightArrow;': '\u2197',
+        'Upsi;': '\u03d2',
+        'upsi;': '\u03c5',
+        'upsih;': '\u03d2',
+        'Upsilon;': '\u03a5',
+        'upsilon;': '\u03c5',
+        'UpTee;': '\u22a5',
+        'UpTeeArrow;': '\u21a5',
+        'upuparrows;': '\u21c8',
+        'urcorn;': '\u231d',
+        'urcorner;': '\u231d',
+        'urcrop;': '\u230e',
+        'Uring;': '\u016e',
+        'uring;': '\u016f',
+        'urtri;': '\u25f9',
+        'Uscr;': '\U0001d4b0',
+        'uscr;': '\U0001d4ca',
+        'utdot;': '\u22f0',
+        'Utilde;': '\u0168',
+        'utilde;': '\u0169',
+        'utri;': '\u25b5',
+        'utrif;': '\u25b4',
+        'uuarr;': '\u21c8',
+        'Uuml': '\xdc',
+        'uuml': '\xfc',
+        'Uuml;': '\xdc',
+        'uuml;': '\xfc',
+        'uwangle;': '\u29a7',
+        'vangrt;': '\u299c',
+        'varepsilon;': '\u03f5',
+        'varkappa;': '\u03f0',
+        'varnothing;': '\u2205',
+        'varphi;': '\u03d5',
+        'varpi;': '\u03d6',
+        'varpropto;': '\u221d',
+        'vArr;': '\u21d5',
+        'varr;': '\u2195',
+        'varrho;': '\u03f1',
+        'varsigma;': '\u03c2',
+        'varsubsetneq;': '\u228a\ufe00',
+        'varsubsetneqq;': '\u2acb\ufe00',
+        'varsupsetneq;': '\u228b\ufe00',
+        'varsupsetneqq;': '\u2acc\ufe00',
+        'vartheta;': '\u03d1',
+        'vartriangleleft;': '\u22b2',
+        'vartriangleright;': '\u22b3',
+        'Vbar;': '\u2aeb',
+        'vBar;': '\u2ae8',
+        'vBarv;': '\u2ae9',
+        'Vcy;': '\u0412',
+        'vcy;': '\u0432',
+        'VDash;': '\u22ab',
+        'Vdash;': '\u22a9',
+        'vDash;': '\u22a8',
+        'vdash;': '\u22a2',
+        'Vdashl;': '\u2ae6',
+        'Vee;': '\u22c1',
+        'vee;': '\u2228',
+        'veebar;': '\u22bb',
+        'veeeq;': '\u225a',
+        'vellip;': '\u22ee',
+        'Verbar;': '\u2016',
+        'verbar;': '|',
+        'Vert;': '\u2016',
+        'vert;': '|',
+        'VerticalBar;': '\u2223',
+        'VerticalLine;': '|',
+        'VerticalSeparator;': '\u2758',
+        'VerticalTilde;': '\u2240',
+        'VeryThinSpace;': '\u200a',
+        'Vfr;': '\U0001d519',
+        'vfr;': '\U0001d533',
+        'vltri;': '\u22b2',
+        'vnsub;': '\u2282\u20d2',
+        'vnsup;': '\u2283\u20d2',
+        'Vopf;': '\U0001d54d',
+        'vopf;': '\U0001d567',
+        'vprop;': '\u221d',
+        'vrtri;': '\u22b3',
+        'Vscr;': '\U0001d4b1',
+        'vscr;': '\U0001d4cb',
+        'vsubnE;': '\u2acb\ufe00',
+        'vsubne;': '\u228a\ufe00',
+        'vsupnE;': '\u2acc\ufe00',
+        'vsupne;': '\u228b\ufe00',
+        'Vvdash;': '\u22aa',
+        'vzigzag;': '\u299a',
+        'Wcirc;': '\u0174',
+        'wcirc;': '\u0175',
+        'wedbar;': '\u2a5f',
+        'Wedge;': '\u22c0',
+        'wedge;': '\u2227',
+        'wedgeq;': '\u2259',
+        'weierp;': '\u2118',
+        'Wfr;': '\U0001d51a',
+        'wfr;': '\U0001d534',
+        'Wopf;': '\U0001d54e',
+        'wopf;': '\U0001d568',
+        'wp;': '\u2118',
+        'wr;': '\u2240',
+        'wreath;': '\u2240',
+        'Wscr;': '\U0001d4b2',
+        'wscr;': '\U0001d4cc',
+        'xcap;': '\u22c2',
+        'xcirc;': '\u25ef',
+        'xcup;': '\u22c3',
+        'xdtri;': '\u25bd',
+        'Xfr;': '\U0001d51b',
+        'xfr;': '\U0001d535',
+        'xhArr;': '\u27fa',
+        'xharr;': '\u27f7',
+        'Xi;': '\u039e',
+        'xi;': '\u03be',
+        'xlArr;': '\u27f8',
+        'xlarr;': '\u27f5',
+        'xmap;': '\u27fc',
+        'xnis;': '\u22fb',
+        'xodot;': '\u2a00',
+        'Xopf;': '\U0001d54f',
+        'xopf;': '\U0001d569',
+        'xoplus;': '\u2a01',
+        'xotime;': '\u2a02',
+        'xrArr;': '\u27f9',
+        'xrarr;': '\u27f6',
+        'Xscr;': '\U0001d4b3',
+        'xscr;': '\U0001d4cd',
+        'xsqcup;': '\u2a06',
+        'xuplus;': '\u2a04',
+        'xutri;': '\u25b3',
+        'xvee;': '\u22c1',
+        'xwedge;': '\u22c0',
+        'Yacute': '\xdd',
+        'yacute': '\xfd',
+        'Yacute;': '\xdd',
+        'yacute;': '\xfd',
+        'YAcy;': '\u042f',
+        'yacy;': '\u044f',
+        'Ycirc;': '\u0176',
+        'ycirc;': '\u0177',
+        'Ycy;': '\u042b',
+        'ycy;': '\u044b',
+        'yen': '\xa5',
+        'yen;': '\xa5',
+        'Yfr;': '\U0001d51c',
+        'yfr;': '\U0001d536',
+        'YIcy;': '\u0407',
+        'yicy;': '\u0457',
+        'Yopf;': '\U0001d550',
+        'yopf;': '\U0001d56a',
+        'Yscr;': '\U0001d4b4',
+        'yscr;': '\U0001d4ce',
+        'YUcy;': '\u042e',
+        'yucy;': '\u044e',
+        'yuml': '\xff',
+        'Yuml;': '\u0178',
+        'yuml;': '\xff',
+        'Zacute;': '\u0179',
+        'zacute;': '\u017a',
+        'Zcaron;': '\u017d',
+        'zcaron;': '\u017e',
+        'Zcy;': '\u0417',
+        'zcy;': '\u0437',
+        'Zdot;': '\u017b',
+        'zdot;': '\u017c',
+        'zeetrf;': '\u2128',
+        'ZeroWidthSpace;': '\u200b',
+        'Zeta;': '\u0396',
+        'zeta;': '\u03b6',
+        'Zfr;': '\u2128',
+        'zfr;': '\U0001d537',
+        'ZHcy;': '\u0416',
+        'zhcy;': '\u0436',
+        'zigrarr;': '\u21dd',
+        'Zopf;': '\u2124',
+        'zopf;': '\U0001d56b',
+        'Zscr;': '\U0001d4b5',
+        'zscr;': '\U0001d4cf',
+        'zwj;': '\u200d',
+        'zwnj;': '\u200c',
+    }
+
 try:
     import http.client as compat_http_client
 except ImportError:  # Python 2
@@ -82,7 +2322,6 @@ try:
 except ImportError:  # Python 2
     from HTMLParser import HTMLParser as compat_HTMLParser
 
-
 try:
     from subprocess import DEVNULL
     compat_subprocess_get_DEVNULL = lambda: DEVNULL
@@ -244,13 +2483,20 @@ try:
 except ImportError:  # Python 2.6
     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 
+
+etree = xml.etree.ElementTree
+
+
+class _TreeBuilder(etree.TreeBuilder):
+    def doctype(self, name, pubid, system):
+        pass
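+        # Declaring doctype() on the TreeBuilder target makes the parser
+        # swallow <!DOCTYPE ...> declarations quietly, instead of triggering
+        # the DeprecationWarning some Python versions emit for them.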
+
 if sys.version_info[0] >= 3:
-    compat_etree_fromstring = xml.etree.ElementTree.fromstring
+    def compat_etree_fromstring(text):
+        return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
 else:
     # python 2.x tries to encode unicode strings with ascii (see the
     # XMLParser._fixtext method)
-    etree = xml.etree.ElementTree
-
     try:
         _etree_iter = etree.Element.iter
     except AttributeError:  # Python <=2.6
@@ -264,7 +2510,7 @@ else:
     # 2.7 source
     def _XML(text, parser=None):
         if not parser:
-            parser = etree.XMLParser(target=etree.TreeBuilder())
+            parser = etree.XMLParser(target=_TreeBuilder())
         parser.feed(text)
         return parser.close()
 
@@ -276,7 +2522,7 @@ else:
         return el
 
     def compat_etree_fromstring(text):
-        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+        doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
         for el in _etree_iter(doc):
             if el.text is not None and isinstance(el.text, bytes):
                 el.text = el.text.decode('utf-8')
@@ -340,24 +2586,28 @@ except ImportError:  # Python 2
         return parsed_result
 
 try:
-    from shlex import quote as shlex_quote
+    from shlex import quote as compat_shlex_quote
 except ImportError:  # Python < 3.3
-    def shlex_quote(s):
+    def compat_shlex_quote(s):
         if re.match(r'^[-_\w./]+$', s):
             return s
         else:
             return "'" + s.replace("'", "'\"'\"'") + "'"
 
 
-if sys.version_info >= (2, 7, 3):
+try:
+    args = shlex.split('中文')
+    assert (isinstance(args, list) and
+            isinstance(args[0], compat_str) and
+            args[0] == '中文')
     compat_shlex_split = shlex.split
-else:
+except (AssertionError, UnicodeEncodeError):
     # Working around shlex issue with unicode strings on some python 2
     # versions (see http://bugs.python.org/issue1548891)
     def compat_shlex_split(s, comments=False, posix=True):
         if isinstance(s, compat_str):
             s = s.encode('utf-8')
-        return shlex.split(s, comments, posix)
+        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))
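+    # e.g. on such interpreters compat_shlex_split('-o "out dir/file"') still
+    # returns the unicode list ['-o', 'out dir/file'], round-tripped via UTF-8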
 
 
 def compat_ord(c):
@@ -373,6 +2623,9 @@ compat_os_name = os._name if os.name == 'java' else os.name
 if sys.version_info >= (3, 0):
     compat_getenv = os.getenv
     compat_expanduser = os.path.expanduser
+
+    def compat_setenv(key, value, env=os.environ):
+        env[key] = value
 else:
     # Environment variables should be decoded with filesystem encoding.
     # Otherwise it will fail if any non-ASCII characters are present (see #3854 #3217 #2918)
@@ -384,6 +2637,12 @@ else:
             env = env.decode(get_filesystem_encoding())
         return env
 
+    def compat_setenv(key, value, env=os.environ):
+        def encode(v):
+            from .utils import get_filesystem_encoding
+            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+        env[encode(key)] = encode(value)
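+    # e.g. compat_setenv('HTTP_PROXY', 'http://proxy.example:3128') stores both
+    # key and value as filesystem-encoded bytes in os.environ on Python 2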
+
     # HACK: The default implementations of os.path.expanduser from cpython do not decode
     # environment variables with filesystem encoding. We will work around this by
     # providing adjusted implementations.
@@ -456,18 +2715,6 @@ else:
         print(s)
 
 
-try:
-    subprocess_check_output = subprocess.check_output
-except AttributeError:
-    def subprocess_check_output(*args, **kwargs):
-        assert 'input' not in kwargs
-        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
-        output, _ = p.communicate()
-        ret = p.poll()
-        if ret:
-            raise subprocess.CalledProcessError(ret, p.args, output=output)
-        return output
-
 if sys.version_info < (3, 0) and sys.platform == 'win32':
     def compat_getpass(prompt, *args, **kwargs):
         if isinstance(prompt, compat_str):
@@ -477,6 +2724,11 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':
 else:
     compat_getpass = getpass.getpass
 
+try:
+    compat_input = raw_input
+except NameError:  # Python 3
+    compat_input = input
+
 # Python < 2.6.5 require kwargs to be bytes
 try:
     def _testfunc(x):
@@ -583,6 +2835,26 @@ if sys.version_info >= (3, 0):
 else:
     from tokenize import generate_tokens as compat_tokenize_tokenize
 
+
+try:
+    struct.pack('!I', 0)
+except TypeError:
+    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+    # See https://bugs.python.org/issue19099
+    def compat_struct_pack(spec, *args):
+        if isinstance(spec, compat_str):
+            spec = spec.encode('ascii')
+        return struct.pack(spec, *args)
+
+    def compat_struct_unpack(spec, *args):
+        if isinstance(spec, compat_str):
+            spec = spec.encode('ascii')
+        return struct.unpack(spec, *args)
+else:
+    compat_struct_pack = struct.pack
+    compat_struct_unpack = struct.unpack
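+# Illustration: compat_struct_pack('!I', 1) == b'\x00\x00\x00\x01' everywhere,
+# including interpreters where struct.pack('!I', 1) itself raises TypeError
+# for a unicode format string (the issue19099 case probed above)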
+
+
 __all__ = [
     'compat_HTMLParser',
     'compat_HTTPError',
@@ -596,17 +2868,23 @@ __all__ = [
     'compat_getenv',
     'compat_getpass',
     'compat_html_entities',
+    'compat_html_entities_html5',
     'compat_http_client',
     'compat_http_server',
+    'compat_input',
     'compat_itertools_count',
     'compat_kwargs',
     'compat_ord',
     'compat_os_name',
     'compat_parse_qs',
     'compat_print',
+    'compat_setenv',
+    'compat_shlex_quote',
     'compat_shlex_split',
     'compat_socket_create_connection',
     'compat_str',
+    'compat_struct_pack',
+    'compat_struct_unpack',
     'compat_subprocess_get_DEVNULL',
     'compat_tokenize_tokenize',
     'compat_urllib_error',
@@ -623,7 +2901,5 @@ __all__ = [
     'compat_urlretrieve',
     'compat_xml_parse_error',
     'compat_xpath',
-    'shlex_quote',
-    'subprocess_check_output',
     'workaround_optparse_bug9161',
 ]
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index 8d642fc3e60594f10a057847cf5702f715941326..fae2450248494a70f237a65d07a4bedcfddaadeb 100644
@@ -6,6 +6,7 @@ import sys
 import re
 
 from .common import FileDownloader
+from ..compat import compat_setenv
 from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
 from ..utils import (
     cli_option,
@@ -84,7 +85,7 @@ class ExternalFD(FileDownloader):
             cmd, stderr=subprocess.PIPE)
         _, stderr = p.communicate()
         if p.returncode != 0:
-            self.to_stderr(stderr)
+            self.to_stderr(stderr.decode('utf-8', 'replace'))
         return p.returncode
 
 
@@ -198,6 +199,19 @@ class FFmpegFD(ExternalFD):
                 '-headers',
                 ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
 
+        env = None
+        proxy = self.params.get('proxy')
+        if proxy:
+            if not re.match(r'^[\da-zA-Z]+://', proxy):
+                proxy = 'http://%s' % proxy
+            # Since December 2015 ffmpeg supports the -http_proxy option (see
+            # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd)
+            # We could switch to the following code if we were able to detect the ffmpeg version properly:
+            # args += ['-http_proxy', proxy]
+            env = os.environ.copy()
+            compat_setenv('HTTP_PROXY', proxy, env=env)
+            compat_setenv('http_proxy', proxy, env=env)
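+            # ffmpeg's HTTP protocol honours these environment variables, so
+            # the effect matches a (hypothetical) invocation such as
+            #   HTTP_PROXY=http://127.0.0.1:3128 ffmpeg -i <url> ...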
+
         protocol = info_dict.get('protocol')
 
         if protocol == 'rtmp':
@@ -224,7 +238,7 @@ class FFmpegFD(ExternalFD):
                 args += ['-rtmp_live', 'live']
 
         args += ['-i', url, '-c', 'copy']
-        if protocol == 'm3u8':
+        if protocol in ('m3u8', 'm3u8_native'):
             if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
                 args += ['-f', 'mpegts']
             else:
@@ -239,7 +253,7 @@ class FFmpegFD(ExternalFD):
 
         self._debug_cmd(args)
 
-        proc = subprocess.Popen(args, stdin=subprocess.PIPE)
+        proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)
         try:
             retval = proc.wait()
         except KeyboardInterrupt:
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 664d87543d07f7c357b803e0a0058034b71276a6..80c21d40bc88382a64634b0eeb9daa3eaaccc303 100644
@@ -12,37 +12,49 @@ from ..compat import (
     compat_urlparse,
     compat_urllib_error,
     compat_urllib_parse_urlparse,
+    compat_struct_pack,
+    compat_struct_unpack,
 )
 from ..utils import (
     encodeFilename,
     fix_xml_ampersands,
     sanitize_open,
-    struct_pack,
-    struct_unpack,
     xpath_text,
 )
 
 
+class DataTruncatedError(Exception):
+    pass
+
+
 class FlvReader(io.BytesIO):
     """
     Reader for Flv files
     The file format is documented in https://www.adobe.com/devnet/f4v.html
     """
 
+    def read_bytes(self, n):
+        data = self.read(n)
+        if len(data) < n:
+            raise DataTruncatedError(
+                'FlvReader error: need %d bytes but only got %d' % (
+                    n, len(data)))
+        return data
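+    # read_bytes() is used below in place of bare read() so that a truncated
+    # segment surfaces as an explicit DataTruncatedError rather than a cryptic
+    # struct.error raised on a short buffer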
+
     # Utility functions for reading numbers and strings
     def read_unsigned_long_long(self):
-        return struct_unpack('!Q', self.read(8))[0]
+        return compat_struct_unpack('!Q', self.read_bytes(8))[0]
 
     def read_unsigned_int(self):
-        return struct_unpack('!I', self.read(4))[0]
+        return compat_struct_unpack('!I', self.read_bytes(4))[0]
 
     def read_unsigned_char(self):
-        return struct_unpack('!B', self.read(1))[0]
+        return compat_struct_unpack('!B', self.read_bytes(1))[0]
 
     def read_string(self):
         res = b''
         while True:
-            char = self.read(1)
+            char = self.read_bytes(1)
             if char == b'\x00':
                 break
             res += char
@@ -53,18 +65,18 @@ class FlvReader(io.BytesIO):
         Read a box and return the info as a tuple: (box_size, box_type, box_data)
         """
         real_size = size = self.read_unsigned_int()
-        box_type = self.read(4)
+        box_type = self.read_bytes(4)
         header_end = 8
         if size == 1:
             real_size = self.read_unsigned_long_long()
             header_end = 16
-        return real_size, box_type, self.read(real_size - header_end)
+        return real_size, box_type, self.read_bytes(real_size - header_end)
 
     def read_asrt(self):
         # version
         self.read_unsigned_char()
         # flags
-        self.read(3)
+        self.read_bytes(3)
         quality_entry_count = self.read_unsigned_char()
         # QualityEntryCount
         for i in range(quality_entry_count):
@@ -85,7 +97,7 @@ class FlvReader(io.BytesIO):
         # version
         self.read_unsigned_char()
         # flags
-        self.read(3)
+        self.read_bytes(3)
         # time scale
         self.read_unsigned_int()
 
@@ -119,7 +131,7 @@ class FlvReader(io.BytesIO):
         # version
         self.read_unsigned_char()
         # flags
-        self.read(3)
+        self.read_bytes(3)
 
         self.read_unsigned_int()  # BootstrapinfoVersion
         # Profile,Live,Update,Reserved
@@ -184,6 +196,11 @@ def build_fragments_list(boot_info):
     first_frag_number = fragment_run_entry_table[0]['first']
     fragments_counter = itertools.count(first_frag_number)
     for segment, fragments_count in segment_run_table['segment_run']:
+        # In some live HDS streams (for example Rai), `fragments_count` is
+        # abnormally large and causes out-of-memory errors. It's OK to change
+        # the number of fragments for live streams, as they are updated
+        # periodically anyway
+        if fragments_count == 4294967295 and boot_info['live']:
+            fragments_count = 2
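+        # (4294967295 == 0xFFFFFFFF, i.e. an unsigned-int "-1" placeholder in
+        # the bootstrap box rather than a real fragment count)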
         for _ in range(fragments_count):
             res.append((segment, next(fragments_counter)))
 
@@ -194,11 +211,11 @@ def build_fragments_list(boot_info):
 
 
 def write_unsigned_int(stream, val):
-    stream.write(struct_pack('!I', val))
+    stream.write(compat_struct_pack('!I', val))
 
 
 def write_unsigned_int_24(stream, val):
-    stream.write(struct_pack('!I', val)[1:])
+    stream.write(compat_struct_pack('!I', val)[1:])
 
 
 def write_flv_header(stream):
@@ -307,7 +324,7 @@ class F4mFD(FragmentFD):
         doc = compat_etree_fromstring(manifest)
         formats = [(int(f.attrib.get('bitrate', -1)), f)
                    for f in self._get_unencrypted_media(doc)]
-        if requested_bitrate is None:
+        if requested_bitrate is None or len(formats) == 1:
             # get the best format
             formats = sorted(formats, key=lambda f: f[0])
             rate, media = formats[-1]
@@ -317,7 +334,11 @@ class F4mFD(FragmentFD):
 
         base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
         bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
-        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url)
+        # From Adobe F4M 3.0 spec:
+        # The <baseURL> element SHALL be the base URL for all relative
+        # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said
+        # URLs should be relative to the location of the containing document.
+        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url)
         live = boot_info['live']
         metadata_node = media.find(_add_ns('metadata'))
         if metadata_node is not None:
@@ -374,7 +395,17 @@ class F4mFD(FragmentFD):
                 down.close()
                 reader = FlvReader(down_data)
                 while True:
-                    _, box_type, box_data = reader.read_box_info()
+                    try:
+                        _, box_type, box_data = reader.read_box_info()
+                    except DataTruncatedError:
+                        if test:
+                            # In tests, segments may be truncated, and thus
+                            # FlvReader may not be able to parse the whole
+                            # chunk. If so, write the segment as is
+                            # See https://github.com/rg3/youtube-dl/issues/9214
+                            dest_stream.write(down_data)
+                            break
+                        raise
                     if box_type == b'mdat':
                         dest_stream.write(box_data)
                         break
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index a01dac031aa3b0c012a4262d210d16ef2b10a47a..3b7bb35087568b084e84fcd0610f951c0eaf8472 100644
@@ -2,13 +2,24 @@ from __future__ import unicode_literals
 
 import os.path
 import re
+import binascii
+try:
+    from Crypto.Cipher import AES
+    can_decrypt_frag = True
+except ImportError:
+    can_decrypt_frag = False
 
 from .fragment import FragmentFD
+from .external import FFmpegFD
 
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_urlparse,
+    compat_struct_pack,
+)
 from ..utils import (
     encodeFilename,
     sanitize_open,
+    parse_m3u8_attributes,
 )
 
 
@@ -17,42 +28,101 @@ class HlsFD(FragmentFD):
 
     FD_NAME = 'hlsnative'
 
+    @staticmethod
+    def can_download(manifest):
+        UNSUPPORTED_FEATURES = (
+            r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encrypted streams [1]
+            r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2]
+
+            # Live streams heuristic does not always work (e.g. geo restricted to Germany
+            # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
+            # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3]
+
+            # This heuristic is not correct either, since segments may just as
+            # well never be appended: Twitch VODs of finished streams carry
+            # EXT-X-PLAYLIST-TYPE:EVENT even though no new segments will ever
+            # be appended to the end of the playlist.
+            # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
+            #                                 # event media playlists [4]
+
+            # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
+            # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
+            # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
+            # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
+        )
+        check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
+        check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest)
+        return all(check_results)
+
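+    # For example, a (hypothetical) manifest containing
+    # '#EXT-X-KEY:METHOD=SAMPLE-AES' fails the first check above, so
+    # can_download() returns False and real_download() below delegates the
+    # job to FFmpegFD.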
     def real_download(self, filename, info_dict):
         man_url = info_dict['url']
         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
         manifest = self.ydl.urlopen(man_url).read()
 
         s = manifest.decode('utf-8', 'ignore')
-        fragment_urls = []
+
+        if not self.can_download(s):
+            self.report_warning(
+                'hlsnative has detected features it does not support, '
+                'extraction will be delegated to ffmpeg')
+            fd = FFmpegFD(self.ydl, self.params)
+            for ph in self._progress_hooks:
+                fd.add_progress_hook(ph)
+            return fd.real_download(filename, info_dict)
+
+        total_frags = 0
         for line in s.splitlines():
             line = line.strip()
             if line and not line.startswith('#'):
-                segment_url = (
-                    line
-                    if re.match(r'^https?://', line)
-                    else compat_urlparse.urljoin(man_url, line))
-                fragment_urls.append(segment_url)
-                # We only download the first fragment during the test
-                if self.params.get('test', False):
-                    break
+                total_frags += 1
 
         ctx = {
             'filename': filename,
-            'total_frags': len(fragment_urls),
+            'total_frags': total_frags,
         }
 
         self._prepare_and_start_frag_download(ctx)
 
+        i = 0
+        media_sequence = 0
+        decrypt_info = {'METHOD': 'NONE'}
         frags_filenames = []
-        for i, frag_url in enumerate(fragment_urls):
-            frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
-            success = ctx['dl'].download(frag_filename, {'url': frag_url})
-            if not success:
-                return False
-            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
-            ctx['dest_stream'].write(down.read())
-            down.close()
-            frags_filenames.append(frag_sanitized)
+        for line in s.splitlines():
+            line = line.strip()
+            if line:
+                if not line.startswith('#'):
+                    frag_url = (
+                        line
+                        if re.match(r'^https?://', line)
+                        else compat_urlparse.urljoin(man_url, line))
+                    frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
+                    success = ctx['dl'].download(frag_filename, {'url': frag_url})
+                    if not success:
+                        return False
+                    down, frag_sanitized = sanitize_open(frag_filename, 'rb')
+                    frag_content = down.read()
+                    down.close()
+                    if decrypt_info['METHOD'] == 'AES-128':
+                        iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
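+                        # '>8xq' packs 8 zero bytes plus the media sequence
+                        # number as a big-endian 64-bit integer: the 16-byte
+                        # default IV the HLS draft prescribes when the key tag
+                        # carries no IV attribute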
+                        frag_content = AES.new(
+                            decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
+                    ctx['dest_stream'].write(frag_content)
+                    frags_filenames.append(frag_sanitized)
+                    # We only download the first fragment during the test
+                    if self.params.get('test', False):
+                        break
+                    i += 1
+                    media_sequence += 1
+                elif line.startswith('#EXT-X-KEY'):
+                    decrypt_info = parse_m3u8_attributes(line[11:])
+                    if decrypt_info['METHOD'] == 'AES-128':
+                        if 'IV' in decrypt_info:
+                            decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:])
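+                            # the IV attribute is a 0x-prefixed hex string;
+                            # [2:] drops the prefix before decoding to raw bytes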
+                        if not re.match(r'^https?://', decrypt_info['URI']):
+                            decrypt_info['URI'] = compat_urlparse.urljoin(
+                                man_url, decrypt_info['URI'])
+                        decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+                elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
+                    media_sequence = int(line[22:])
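+                    # len('#EXT-X-MEDIA-SEQUENCE:') == 22, so e.g.
+                    # '#EXT-X-MEDIA-SEQUENCE:7' sets media_sequence to 7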
 
         self._finish_frag_download(ctx)
 
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
new file mode 100644
index 0000000..b61a632
--- /dev/null
+++ b/youtube_dl/extractor/abcnews.py
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import re
+import time
+
+from .amp import AMPIE
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+
+
+class AbcNewsVideoIE(AMPIE):
+    IE_NAME = 'abcnews:video'
+    _VALID_URL = r'http://abcnews\.go\.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
+        'info_dict': {
+            'id': '20411932',
+            'ext': 'mp4',
+            'display_id': 'week-exclusive-irans-foreign-minister-zarif',
+            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
+            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
+            'duration': 180,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_id = mobj.group('id')
+        info_dict = self._extract_feed_info(
+            'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+        return info_dict
+
+
+class AbcNewsIE(InfoExtractor):
+    IE_NAME = 'abcnews'
+    _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
+        'info_dict': {
+            'id': '10498713',
+            'ext': 'flv',
+            'display_id': 'dramatic-video-rare-death-job-america',
+            'title': 'Occupational Hazards',
+            'description': 'Nightline investigates the dangers that lurk at various jobs.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20100428',
+            'timestamp': 1272412800,
+        },
+        'add_ie': ['AbcNewsVideo'],
+    }, {
+        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
+        'info_dict': {
+            'id': '39125818',
+            'ext': 'mp4',
+            'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
+            'title': 'Justin Timberlake Drops Hints For Secret Single',
+            'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
+            'upload_date': '20160515',
+            'timestamp': 1463329500,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+            # The embedded YouTube video is blocked due to copyright issues
+            'playlist_items': '1',
+        },
+        'add_ie': ['AbcNewsVideo'],
+    }, {
+        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_url = self._search_regex(
+            r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
+        full_video_url = compat_urlparse.urljoin(url, video_url)
+
+        youtube_url = self._html_search_regex(
+            r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
+            webpage, 'YouTube URL', default=None)
+
+        timestamp = None
+        date_str = self._html_search_regex(
+            r'<span[^>]+class="timestamp">([^<]+)</span>',
+            webpage, 'timestamp', fatal=False)
+        if date_str:
+            tz_offset = 0
+            if date_str.endswith(' ET'):  # Eastern Time
+                tz_offset = -5
+                date_str = date_str[:-3]
+            date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
+            for date_format in date_formats:
+                try:
+                    timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
+                except ValueError:
+                    continue
+            if timestamp is not None:
+                timestamp -= tz_offset * 3600
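+            # e.g. 'May 15, 2016, 3:25 PM ET' matches the second format; the
+            # fixed UTC-5 offset then shifts the parsed value 5 hours forward
+            # to approximate UTC (the EDT/EST distinction is not modelled)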
+
+        entry = {
+            '_type': 'url_transparent',
+            'ie_key': AbcNewsVideoIE.ie_key(),
+            'url': full_video_url,
+            'id': video_id,
+            'display_id': display_id,
+            'timestamp': timestamp,
+        }
+
+        if youtube_url:
+            entries = [entry, self.url_result(youtube_url, 'Youtube')]
+            return self.playlist_result(entries)
+
+        return entry
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 8753ee2cf2b5fdaa5810fc8d564f388734a84324..5ae16fa16809b557e74e133a4a7811d396b1c2c2 100644
@@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(url + '?format=json', video_id)
+        webpage = self._download_webpage(url, video_id)
+
+        video_data = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
 
         formats = [{
             'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')),
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 1bbfe264177dab80c6e40009e8543e52600c3d9d..8f53050c949a1ba2ebdeedbacba9610ca694ef4f 100644
@@ -2,41 +2,33 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
+from .theplatform import ThePlatformIE
 from ..utils import (
     smuggle_url,
     update_url_query,
     unescapeHTML,
+    extract_attributes,
+    get_element_by_attribute,
 )
+from ..compat import (
+    compat_urlparse,
+)
+
 
+class AENetworksBaseIE(ThePlatformIE):
+    _THEPLATFORM_KEY = 'crazyjava'
+    _THEPLATFORM_SECRET = 's3cr3t'
 
-class AENetworksIE(InfoExtractor):
+
+class AENetworksIE(AENetworksBaseIE):
     IE_NAME = 'aenetworks'
     IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
-
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)'
     _TESTS = [{
-        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
-        'info_dict': {
-            'id': 'g12m5Gyt3fdR',
-            'ext': 'mp4',
-            'title': "Bet You Didn't Know: Valentine's Day",
-            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
-            'timestamp': 1375819729,
-            'upload_date': '20130806',
-            'uploader': 'AENE-NEW',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'add_ie': ['ThePlatform'],
-        'expected_warnings': ['JSON-LD'],
-    }, {
         'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
         'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
         'info_dict': {
-            'id': 'eg47EERs_JsZ',
+            'id': '22253814',
             'ext': 'mp4',
             'title': 'Winter Is Coming',
             'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
@@ -46,42 +38,168 @@ class AENetworksIE(InfoExtractor):
         },
         'add_ie': ['ThePlatform'],
     }, {
-        'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry',
+        'url': 'http://www.history.com/shows/ancient-aliens/season-1',
+        'info_dict': {
+            'id': '71889446852',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
+        'info_dict': {
+            'id': 'SERIES4317',
+            'title': 'Atlanta Plastic',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
         'only_matching': True
     }, {
-        'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage',
+        'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
         'only_matching': True
     }, {
-        'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients',
+        'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
+        'only_matching': True
+    }, {
+        'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
         'only_matching': True
     }]
+    _DOMAIN_TO_REQUESTOR_ID = {
+        'history.com': 'HISTORY',
+        'aetv.com': 'AETV',
+        'mylifetime.com': 'LIFETIME',
+        'fyi.tv': 'FYI',
+    }
 
     def _real_extract(self, url):
-        page_type, video_id = re.match(self._VALID_URL, url).groups()
+        domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
+        display_id = show_path or movie_display_id
+        webpage = self._download_webpage(url, display_id)
+        if show_path:
+            url_parts = show_path.split('/')
+            url_parts_len = len(url_parts)
+            if url_parts_len == 1:
+                entries = []
+                for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
+                    entries.append(self.url_result(
+                        compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeriesId', webpage),
+                    self._html_search_meta('aetn:SeriesTitle', webpage))
+            elif url_parts_len == 2:
+                entries = []
+                for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage):
+                    episode_attributes = extract_attributes(episode_item)
+                    episode_url = compat_urlparse.urljoin(
+                        url, episode_attributes['data-canonical'])
+                    entries.append(self.url_result(
+                        episode_url, 'AENetworks',
+                        episode_attributes['data-videoid']))
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeasonId', webpage))
+
+        query = {
+            'mbr': 'true',
+            'assetTypes': 'medium_video_s3'
+        }
+        video_id = self._html_search_meta('aetn:VideoID', webpage)
+        media_url = self._search_regex(
+            r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        if theplatform_metadata.get('AETN$isBehindWall'):
+            requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
+            resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating'])
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        info.update(self._search_json_ld(webpage, video_id, fatal=False))
+        media_url = update_url_query(media_url, query)
+        media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
+        formats, subtitles = self._extract_theplatform_smil(media_url, video_id)
+        self._sort_formats(formats)
+        info.update({
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+        return info
 
-        webpage = self._download_webpage(url, video_id)
 
-        video_url_re = [
-            r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
-            r"media_url\s*=\s*'([^']+)'"
-        ]
-        video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url'))
-        query = {'mbr': 'true'}
-        if page_type == 'shows':
-            query['assetTypes'] = 'medium_video_s3'
-        if 'switch=hds' in video_url:
-            query['switch'] = 'hls'
+class HistoryTopicIE(AENetworksBaseIE):
+    IE_NAME = 'history:topic'
+    IE_DESC = 'History.com Topic'
+    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?'
+    _TESTS = [{
+        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
+        'info_dict': {
+            'id': '40700995724',
+            'ext': 'mp4',
+            'title': "Bet You Didn't Know: Valentine's Day",
+            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+            'timestamp': 1375819729,
+            'upload_date': '20130806',
+            'uploader': 'AENE-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos',
+        'info_dict':
+        {
+            'id': 'world-war-i-history',
+            'title': 'World War I History',
+        },
+        'playlist_mincount': 24,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i-history/videos',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches',
+        'only_matching': True,
+    }]
 
-        info = self._search_json_ld(webpage, video_id, fatal=False)
-        info.update({
+    def theplatform_url_result(self, theplatform_url, video_id, query):
+        return {
             '_type': 'url_transparent',
+            'id': video_id,
             'url': smuggle_url(
-                update_url_query(video_url, query),
+                update_url_query(theplatform_url, query),
                 {
                     'sig': {
-                        'key': 'crazyjava',
-                        'secret': 's3cr3t'},
+                        'key': self._THEPLATFORM_KEY,
+                        'secret': self._THEPLATFORM_SECRET,
+                    },
                     'force_smil_url': True
                 }),
-        })
-        return info
+            'ie_key': 'ThePlatform',
+        }
+
+    def _real_extract(self, url):
+        topic_id, video_display_id = re.match(self._VALID_URL, url).groups()
+        if video_display_id:
+            webpage = self._download_webpage(url, video_display_id)
+            release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups()
+            release_url = unescapeHTML(release_url)
+
+            return self.theplatform_url_result(
+                release_url, video_id, {
+                    'mbr': 'true',
+                    'switch': 'hls'
+                })
+        else:
+            webpage = self._download_webpage(url, topic_id)
+            entries = []
+            for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage):
+                video_attributes = extract_attributes(episode_item)
+                entries.append(self.theplatform_url_result(
+                    video_attributes['data-release-url'], video_attributes['data-id'], {
+                        'mbr': 'true',
+                        'switch': 'hls'
+                    }))
+            return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage))
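
The rewritten AENetworksIE above routes URLs by how many path segments
follow /shows/: a bare show page yields a series playlist of seasons, a
two-segment path a season playlist of episodes, and anything deeper (or a
/movies/<title>/full-movie URL) falls through to single-video extraction
via ThePlatform. A condensed, runnable model of just that branching:

    def classify(show_path):
        # One segment -> series, two -> season, deeper -> single video.
        parts = show_path.split('/')
        if len(parts) == 1:
            return 'series playlist'
        if len(parts) == 2:
            return 'season playlist'
        return 'single video'

    print(classify('mountain-men'))                     # series playlist
    print(classify('ancient-aliens/season-1'))          # season playlist
    print(classify('mountain-men/season-1/episode-1'))  # single video
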
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
new file mode 100644 (file)
index 0000000..518c61f
--- /dev/null
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    xpath_element,
+    xpath_text,
+)
+
+
+class AfreecaTVIE(InfoExtractor):
+    IE_DESC = 'afreecatv.com'
+    _VALID_URL = r'''(?x)^
+        https?://(?:(live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
+        (?:
+            /app/(?:index|read_ucc_bbs)\.cgi|
+            /player/[Pp]layer\.(?:swf|html))
+        \?.*?\bnTitleNo=(?P<id>\d+)'''
+    _TESTS = [{
+        'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
+        'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
+        'info_dict': {
+            'id': '36164052',
+            'ext': 'mp4',
+            'title': '데일리 에이프릴 요정들의 시상식!',
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+            'upload_date': '20160503',
+        }
+    }, {
+        'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
+        'info_dict': {
+            'id': '36153164',
+            'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+        },
+        'playlist_count': 2,
+        'playlist': [{
+            'md5': 'd8b7c174568da61d774ef0203159bf97',
+            'info_dict': {
+                'id': '36153164_1',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+                'upload_date': '20160502',
+            },
+        }, {
+            'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+            'info_dict': {
+                'id': '36153164_2',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+                'upload_date': '20160502',
+            },
+        }],
+    }, {
+        'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def parse_video_key(key):
+        video_key = {}
+        m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
+        if m:
+            video_key['upload_date'] = m.group('upload_date')
+            video_key['part'] = m.group('part')
+        return video_key
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        parsed_url = compat_urllib_parse_urlparse(url)
+        info_url = compat_urlparse.urlunparse(parsed_url._replace(
+            netloc='afbbs.afreecatv.com:8080',
+            path='/api/video/get_video_info.php'))
+        video_xml = self._download_xml(info_url, video_id)
+
+        if xpath_element(video_xml, './track/video/file') is None:
+            raise ExtractorError('Specified AfreecaTV video does not exist',
+                                 expected=True)
+
+        title = xpath_text(video_xml, './track/title', 'title')
+        uploader = xpath_text(video_xml, './track/nickname', 'uploader')
+        uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
+        duration = int_or_none(xpath_text(video_xml, './track/duration',
+                                          'duration'))
+        thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
+
+        entries = []
+        for i, video_file in enumerate(video_xml.findall('./track/video/file')):
+            video_key = self.parse_video_key(video_file.get('key', ''))
+            if not video_key:
+                continue
+            entries.append({
+                'id': '%s_%s' % (video_id, video_key.get('part', i + 1)),
+                'title': title,
+                'upload_date': video_key.get('upload_date'),
+                'duration': int_or_none(video_file.get('duration')),
+                'url': video_file.text,
+            })
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
+
+        if len(entries) > 1:
+            info['_type'] = 'multi_video'
+            info['entries'] = entries
+        elif len(entries) == 1:
+            info['url'] = entries[0]['url']
+            info['upload_date'] = entries[0].get('upload_date')
+        else:
+            raise ExtractorError(
+                'No files found for the specified AfreecaTV video, either'
+                ' the URL is incorrect or the video has been made private.',
+                expected=True)
+
+        return info
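
AfreecaTV file keys encode the upload date and part number as
<YYYYMMDD>_<token>_<part>; parse_video_key() above splits them apart, and
files with unparseable keys are skipped. Usage sketch (the key token is
made up):

    import re

    def parse_video_key(key):
        video_key = {}
        m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
        if m:
            video_key['upload_date'] = m.group('upload_date')
            video_key['part'] = m.group('part')
        return video_key

    print(parse_video_key('20160502_ABCDEF_1'))
    # {'upload_date': '20160502', 'part': '1'}
    print(parse_video_key('broken-key'))  # {} -> the file is skipped
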
index d548592fe8acbbf2db432db3ed699b80b78e0aa0..5766b4fe80ab3e827ce9cccfe8e5460113c61440 100644 (file)
@@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         # find internal video meta data
-        meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
+        meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json'
         player_config = self._parse_json(self._html_search_regex(
             r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
-        internal_meta_id = player_config['videoId']
+        internal_meta_id = player_config['aptomaVideoId']
         internal_meta_url = meta_url % internal_meta_id
         internal_meta_json = self._download_json(
             internal_meta_url, video_id, 'Downloading video meta data')
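
The Aftonbladet fix above follows the site's move to a new metadata CDN and
a renamed player-config field. The lookup chain in isolation, with an
illustrative data-player-config attribute (youtube-dl's _html_search_regex
unescapes the HTML entities for the extractor; here that is done by hand):

    import json
    import re

    html = '<div data-player-config="{&quot;aptomaVideoId&quot;: 12345}"></div>'

    config_json = re.search(r'data-player-config="([^"]+)"', html).group(1)
    player_config = json.loads(config_json.replace('&quot;', '"'))
    meta_url = ('http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no'
                '/video/%s.json' % player_config['aptomaVideoId'])
    print(meta_url)
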
index 138fa08086ee2d4e7c446c09ce85b8726e4ff255..e8e40126baca4bad27f8593dd9bd026f16fad131 100644 (file)
@@ -5,6 +5,8 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     parse_iso8601,
+    mimetype2ext,
+    determine_ext,
 )
 
 
@@ -50,21 +52,25 @@ class AMPIE(InfoExtractor):
         if isinstance(media_content, dict):
             media_content = [media_content]
         for media_data in media_content:
-            media = media_data['@attributes']
-            media_type = media['type']
-            if media_type == 'video/f4m':
+            media = media_data.get('@attributes', {})
+            media_url = media.get('url')
+            if not media_url:
+                continue
+            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
+            if ext == 'f4m':
                 formats.extend(self._extract_f4m_formats(
-                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                     video_id, f4m_id='hds', fatal=False))
-            elif media_type == 'application/x-mpegURL':
+            elif ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 formats.append({
-                    'format_id': media_data['media-category']['@attributes']['label'],
+                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
                     'url': media['url'],
                     'tbr': int_or_none(media.get('bitrate')),
                     'filesize': int_or_none(media.get('fileSize')),
+                    'ext': ext,
                 })
 
         self._sort_formats(formats)
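
The AMP change above stops comparing raw MIME-type strings and instead
normalizes to a file extension, falling back to sniffing the URL when no
type is declared. A simplified stand-in for the mimetype2ext/determine_ext
pair (the real helpers in youtube_dl.utils cover many more types):

    import posixpath

    MIME_TO_EXT = {
        'video/f4m': 'f4m',               # HDS manifest
        'application/x-mpegURL': 'm3u8',  # HLS playlist
        'video/mp4': 'mp4',
    }

    def resolve_ext(mime, url):
        # Prefer the declared MIME type; fall back to the URL suffix.
        ext = MIME_TO_EXT.get(mime)
        if not ext:
            ext = posixpath.splitext(url)[1].lstrip('.')
        return ext

    print(resolve_ext('application/x-mpegURL', 'http://example.com/master'))  # m3u8
    print(resolve_ext(None, 'http://example.com/video.mp4'))                  # mp4
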
index 9b01e38f5fe8b5a80b2635061433cc214fb1b315..159c6ef5afa423458505e3a74f1ceadce5e31a07 100644 (file)
@@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor):
     _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
     _NETRC_MACHINE = 'animeondemand'
     _TESTS = [{
+        # jap, OmU
         'url': 'https://www.anime-on-demand.de/anime/161',
         'info_dict': {
             'id': '161',
@@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor):
         },
         'playlist_mincount': 4,
     }, {
-        # Film wording is used instead of Episode
+        # Film wording is used instead of Episode, ger/jap, Dub/OmU
         'url': 'https://www.anime-on-demand.de/anime/39',
         'only_matching': True,
     }, {
-        # Episodes without titles
+        # Episodes without titles, jap, OmU
         'url': 'https://www.anime-on-demand.de/anime/162',
         'only_matching': True,
     }, {
         # ger/jap, Dub/OmU, account required
         'url': 'https://www.anime-on-demand.de/anime/169',
         'only_matching': True,
+    }, {
+        # Full length film, non-series, ger/jap, Dub/OmU, account required
+        'url': 'https://www.anime-on-demand.de/anime/185',
+        'only_matching': True,
     }]
 
     def _login(self):
@@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor):
 
         entries = []
 
-        for num, episode_html in enumerate(re.findall(
-                r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1):
-            episodebox_title = self._search_regex(
-                (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
-                 r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
-                episode_html, 'episodebox title', default=None, group='title')
-            if not episodebox_title:
-                continue
-
-            episode_number = int(self._search_regex(
-                r'(?:Episode|Film)\s*(\d+)',
-                episodebox_title, 'episode number', default=num))
-            episode_title = self._search_regex(
-                r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
-                episodebox_title, 'episode title', default=None)
-
-            video_id = 'episode-%d' % episode_number
-
-            common_info = {
-                'id': video_id,
-                'series': anime_title,
-                'episode': episode_title,
-                'episode_number': episode_number,
-            }
-
+        def extract_info(html, video_id, num=None):
+            title, description = [None] * 2
             formats = []
 
             for input_ in re.findall(
-                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html):
+                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):
                 attributes = extract_attributes(input_)
                 playlist_urls = []
                 for playlist_key in ('data-playlist', 'data-otherplaylist'):
@@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor):
                         format_id_list.append(lang)
                     if kind:
                         format_id_list.append(kind)
-                    if not format_id_list:
+                    if not format_id_list and num is not None:
                         format_id_list.append(compat_str(num))
                     format_id = '-'.join(format_id_list)
                     format_note = ', '.join(filter(None, (kind, lang_note)))
@@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor):
                             })
                         formats.extend(file_formats)
 
-            if formats:
-                self._sort_formats(formats)
+            return {
+                'title': title,
+                'description': description,
+                'formats': formats,
+            }
+
+        def extract_entries(html, video_id, common_info, num=None):
+            info = extract_info(html, video_id, num)
+
+            if info['formats']:
+                self._sort_formats(info['formats'])
                 f = common_info.copy()
-                f.update({
-                    'title': title,
-                    'description': description,
-                    'formats': formats,
-                })
+                f.update(info)
                 entries.append(f)
 
-            # Extract teaser only when full episode is not available
-            if not formats:
+            # Extract teaser/trailer only when full episode is not available
+            if not info['formats']:
                 m = re.search(
-                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
-                    episode_html)
+                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
+                    html)
                 if m:
                     f = common_info.copy()
                     f.update({
-                        'id': '%s-teaser' % f['id'],
+                        'id': '%s-%s' % (f['id'], m.group('kind').lower()),
                         'title': m.group('title'),
                         'url': compat_urlparse.urljoin(url, m.group('href')),
                     })
                     entries.append(f)
 
+        def extract_episodes(html):
+            for num, episode_html in enumerate(re.findall(
+                    r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
+                episodebox_title = self._search_regex(
+                    (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+                     r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+                    episode_html, 'episodebox title', default=None, group='title')
+                if not episodebox_title:
+                    continue
+
+                episode_number = int(self._search_regex(
+                    r'(?:Episode|Film)\s*(\d+)',
+                    episodebox_title, 'episode number', default=num))
+                episode_title = self._search_regex(
+                    r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+                    episodebox_title, 'episode title', default=None)
+
+                video_id = 'episode-%d' % episode_number
+
+                common_info = {
+                    'id': video_id,
+                    'series': anime_title,
+                    'episode': episode_title,
+                    'episode_number': episode_number,
+                }
+
+                extract_entries(episode_html, video_id, common_info)
+
+        def extract_film(html, video_id):
+            common_info = {
+                'id': anime_id,
+                'title': anime_title,
+                'description': anime_description,
+            }
+            extract_entries(html, video_id, common_info)
+
+        extract_episodes(webpage)
+
+        if not entries:
+            extract_film(webpage, anime_id)
+
         return self.playlist_result(entries, anime_id, anime_title, anime_description)
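
The anime-on-demand refactor above factors the per-episode scraping into
extract_episodes() and adds extract_film() as a fallback, so pages without
episode boxes (full-length films) still produce one entry. A self-contained
model of that control flow, with a regex simplified from the real markup:

    import re

    def collect_entries(page, anime_id):
        entries = []
        for m in re.finditer(r'<h3 class="episodebox-title">([^<]+)</h3>', page):
            entries.append({'id': 'episode-%d' % (len(entries) + 1),
                            'title': m.group(1)})
        if not entries:  # no episode boxes -> treat the page as one film
            entries.append({'id': anime_id, 'title': 'full-length film'})
        return entries

    print(collect_entries('<h3 class="episodebox-title">Episode 1 - Pilot</h3>', '185'))
    print(collect_entries('<p>no episode boxes</p>', '185'))
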
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
new file mode 100644 (file)
index 0000000..cb29cf1
--- /dev/null
@@ -0,0 +1,224 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+import json
+import random
+import time
+
+from .common import InfoExtractor
+from ..aes import aes_encrypt
+from ..compat import compat_str
+from ..utils import (
+    bytes_to_intlist,
+    determine_ext,
+    intlist_to_bytes,
+    int_or_none,
+    strip_jsonp,
+)
+
+
+def md5_text(s):
+    if not isinstance(s, compat_str):
+        s = compat_str(s)
+    return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+class AnvatoIE(InfoExtractor):
+    # Copied from anvplayer.min.js
+    _ANVACK_TABLE = {
+        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
+        'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
+        'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
+        'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
+        'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
+        'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
+        'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
+        'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
+        'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
+        'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
+        'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
+        'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
+        'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
+        'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
+        'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
+        'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
+        'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
+        'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
+        'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
+        'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
+        'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
+        'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
+        'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
+        'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
+        'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
+        'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
+        'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
+        'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
+        'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
+        'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
+        'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
+        'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
+        'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
+        'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
+        'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
+        'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
+        'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
+        'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
+        'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
+        'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
+        'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
+        'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
+        'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
+        'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
+        'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
+        'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
+        'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
+        'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
+        'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
+        'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
+        'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+        'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+        'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
+        'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
+        'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
+        'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
+        'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
+        'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
+        'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
+        'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
+        'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
+        'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
+        'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
+        'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
+        'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
+        'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
+        'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
+        'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
+        'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
+        'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
+        'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
+        'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
+        'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
+        'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
+        'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+        'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+        'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
+        'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
+        'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
+        'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
+        'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
+        'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
+        'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
+        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
+    }
+
+    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
+
+    def __init__(self, *args, **kwargs):
+        super(AnvatoIE, self).__init__(*args, **kwargs)
+        self.__server_time = None
+
+    def _server_time(self, access_key, video_id):
+        if self.__server_time is not None:
+            return self.__server_time
+
+        self.__server_time = int(self._download_json(
+            self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
+            note='Fetching server time')['server_time'])
+
+        return self.__server_time
+
+    def _api_prefix(self, access_key):
+        return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
+
+    def _get_video_json(self, access_key, video_id):
+        # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
+        video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+        server_time = self._server_time(access_key, video_id)
+        input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+
+        auth_secret = intlist_to_bytes(aes_encrypt(
+            bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
+
+        video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+        anvrid = md5_text(time.time() * 1000 * random.random())[:30]
+        payload = {
+            'api': {
+                'anvrid': anvrid,
+                'anvstk': md5_text('%s|%s|%d|%s' % (
+                    access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])),
+                'anvts': server_time,
+            },
+        }
+
+        return self._download_json(
+            video_data_url, video_id, transform_source=strip_jsonp,
+            data=json.dumps(payload).encode('utf-8'))
+
+    def _extract_anvato_videos(self, webpage, video_id):
+        anvplayer_data = self._parse_json(self._html_search_regex(
+            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
+            'Anvato player data'), video_id)
+
+        video_id = anvplayer_data['video']
+        access_key = anvplayer_data['accessKey']
+
+        video_data = self._get_video_json(access_key, video_id)
+
+        formats = []
+        for published_url in video_data['published_urls']:
+            video_url = published_url['embed_url']
+            ext = determine_ext(video_url)
+
+            if ext == 'smil':
+                formats.extend(self._extract_smil_formats(video_url, video_id))
+                continue
+
+            tbr = int_or_none(published_url.get('kbps'))
+            a_format = {
+                'url': video_url,
+                'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
+                'tbr': tbr if tbr != 0 else None,
+            }
+
+            if ext == 'm3u8':
+                # Not using _extract_m3u8_formats here as individual media
+                # playlists are also included in published_urls.
+                if tbr is None:
+                    formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
+                    continue
+                else:
+                    a_format.update({
+                        'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+                        'ext': 'mp4',
+                    })
+            elif ext == 'mp3':
+                a_format['vcodec'] = 'none'
+            else:
+                a_format.update({
+                    'width': int_or_none(published_url.get('width')),
+                    'height': int_or_none(published_url.get('height')),
+                })
+            formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for caption in video_data.get('captions', []):
+            a_caption = {
+                'url': caption['url'],
+                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
+            }
+            subtitles.setdefault(caption['language'], []).append(a_caption)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_data.get('def_title'),
+            'description': video_data.get('def_description'),
+            'categories': video_data.get('categories'),
+            'thumbnail': video_data.get('thumbnail'),
+            'subtitles': subtitles,
+        }
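
The Anvato access token (anvstk) sent in the payload above is an MD5 over
the access key, a random request id, the server time, and the secret looked
up in the bundled _ANVACK_TABLE; the video-data URL is additionally signed
with an AES-encrypted digest. The anvstk part in isolation, with placeholder
inputs:

    import hashlib
    import random
    import time

    def md5_text(s):
        return hashlib.md5(str(s).encode('utf-8')).hexdigest()

    access_key = 'example_access_key'            # from anvplayer data
    secret = 'example_secret_from_anvack_table'  # from _ANVACK_TABLE
    server_time = int(time.time())               # from the server_time endpoint

    anvrid = md5_text(time.time() * 1000 * random.random())[:30]
    anvstk = md5_text('%s|%s|%d|%s' % (access_key, anvrid, server_time, secret))
    payload = {'api': {'anvrid': anvrid, 'anvstk': anvstk, 'anvts': server_time}}
    print(payload)
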
index 24df8fe9305e7df0487965ed03756305feca3dea..42c21bf41d975bcb49fd6c398b19ed2897cd3bd2 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class AolIE(InfoExtractor):
     IE_NAME = 'on.aol.com'
-    _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/.*-)(?P<id>[^/?-]+)'
+    _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)'
 
     _TESTS = [{
         # video with 5min ID
@@ -53,6 +53,12 @@ class AolIE(InfoExtractor):
     }, {
         'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763',
         'only_matching': True,
+    }, {
+        'url': 'http://on.aol.com/video/519442220',
+        'only_matching': True,
+    }, {
+        'url': 'aol-video:5707d6b8e4b090497b04f706',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
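
The widened AOL pattern above accepts bare numeric IDs at any path depth,
slug-suffixed IDs, and the aol-video: scheme; the two new only_matching
tests exercise exactly that. What the regex captures:

    import re

    _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)'

    for url in ('http://on.aol.com/video/519442220',
                'aol-video:5707d6b8e4b090497b04f706'):
        print(re.match(_VALID_URL, url).group('id'))
    # -> 519442220
    # -> 5707d6b8e4b090497b04f706
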
index be40f85b487057b4cb319dba102cec76519880a5..a6801f3d4860414c286277c92bd994e16212cffd 100644 (file)
@@ -7,6 +7,8 @@ from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
+    parse_duration,
+    unified_strdate,
 )
 
 
@@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor):
     _TESTS = [{
         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
         'info_dict': {
-            'id': 'manofsteel',
+            'id': '5111',
+            'title': 'Man of Steel',
         },
         'playlist': [
             {
@@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor):
             'id': 'blackthorn',
         },
         'playlist_mincount': 2,
+        'expected_warnings': ['Unable to download JSON metadata'],
+    }, {
+        # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
+        'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
+        'info_dict': {
+            'id': '15881',
+            'title': 'Kung Fu Panda 3',
+        },
+        'playlist_mincount': 4,
     }, {
         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
         'only_matching': True,
@@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor):
         movie = mobj.group('movie')
         uploader_id = mobj.group('company')
 
+        webpage = self._download_webpage(url, movie)
+        film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
+        film_data = self._download_json(
+            'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
+            film_id, fatal=False)
+
+        if film_data:
+            entries = []
+            for clip in film_data.get('clips', []):
+                clip_title = clip['title']
+
+                formats = []
+                for version, version_data in clip.get('versions', {}).items():
+                    for size, size_data in version_data.get('sizes', {}).items():
+                        src = size_data.get('src')
+                        if not src:
+                            continue
+                        formats.append({
+                            'format_id': '%s-%s' % (version, size),
+                            'url': re.sub(r'_(\d+p.mov)', r'_h\1', src),
+                            'width': int_or_none(size_data.get('width')),
+                            'height': int_or_none(size_data.get('height')),
+                            'language': version[:2],
+                        })
+                self._sort_formats(formats)
+
+                entries.append({
+                    'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
+                    'formats': formats,
+                    'title': clip_title,
+                    'thumbnail': clip.get('screen') or clip.get('thumb'),
+                    'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
+                    'upload_date': unified_strdate(clip.get('posted')),
+                    'uploader_id': uploader_id,
+                })
+
+            page_data = film_data.get('page', {})
+            return self.playlist_result(entries, film_id, page_data.get('movie_title'))
+
         playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
 
         def fix_html(s):
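
Each size entry's src in the trailers feed points at the plain .mov name;
the substitution above rewrites it to the '_h'-prefixed variant (presumably
the directly downloadable encode Apple hosts). The rewrite in isolation, on
a made-up trailer URL:

    import re

    src = 'http://trailers.apple.com/movies/wb/example/example-trailer_720p.mov'
    print(re.sub(r'_(\d+p.mov)', r'_h\1', src))
    # -> http://trailers.apple.com/movies/wb/example/example-trailer_h720p.mov
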
index 26446c2fe1e4ecb0b15b6ec87a927a2b6151a1da..13a06396d9bcb21e53a0d7dc58413b717729a93d 100644 (file)
@@ -8,12 +8,12 @@ from .generic import GenericIE
 from ..utils import (
     determine_ext,
     ExtractorError,
-    get_element_by_attribute,
     qualities,
     int_or_none,
     parse_duration,
     unified_strdate,
     xpath_text,
+    update_url_query,
 )
 from ..compat import compat_etree_fromstring
 
@@ -35,6 +35,7 @@ class ARDMediathekIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
         'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
@@ -45,6 +46,7 @@ class ARDMediathekIE(InfoExtractor):
             'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
             'duration': 5252,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         # audio
         'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
@@ -56,6 +58,7 @@ class ARDMediathekIE(InfoExtractor):
             'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
             'duration': 3240,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
         'only_matching': True,
@@ -114,11 +117,14 @@ class ARDMediathekIE(InfoExtractor):
                         continue
                     if ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
-                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
-                            video_id, preference=-1, f4m_id='hds', fatal=False))
+                            update_url_query(stream_url, {
+                                'hdcore': '3.1.1',
+                                'plugin': 'aasp-3.1.1.69.124'
+                            }),
+                            video_id, f4m_id='hds', fatal=False))
                     elif ext == 'm3u8':
                         formats.extend(self._extract_m3u8_formats(
-                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False))
+                            stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
                     else:
                         if server and server.startswith('rtmp'):
                             f = {
@@ -232,7 +238,8 @@ class ARDIE(InfoExtractor):
             'title': 'Die Story im Ersten: Mission unter falscher Flagge',
             'upload_date': '20140804',
             'thumbnail': 're:^https?://.*\.jpg$',
-        }
+        },
+        'skip': 'HTTP Error 404: Not Found',
     }
 
     def _real_extract(self, url):
@@ -274,41 +281,3 @@ class ARDIE(InfoExtractor):
             'upload_date': upload_date,
             'thumbnail': thumbnail,
         }
-
-
-class SportschauIE(ARDMediathekIE):
-    IE_NAME = 'Sportschau'
-    _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
-    _TESTS = [{
-        'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
-        'info_dict': {
-            'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
-            'ext': 'mp4',
-            'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-    }]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        base_url = mobj.group('baseurl')
-
-        webpage = self._download_webpage(url, video_id)
-        title = get_element_by_attribute('class', 'headline', webpage)
-        description = self._html_search_meta('description', webpage, 'description')
-
-        info = self._extract_media_info(
-            base_url + '-mc_defaultQuality-h.json', webpage, video_id)
-
-        info.update({
-            'title': title,
-            'description': description,
-        })
-
-        return info
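
Switching the ARD f4m request to update_url_query above avoids gluing a
second '?' onto stream URLs that already carry a query string. A simplified
model of youtube_dl.utils.update_url_query (the real helper handles a few
more cases):

    try:
        from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
    except ImportError:  # Python 2
        from urllib import urlencode
        from urlparse import parse_qsl, urlparse, urlunparse

    def update_url_query(url, query):
        # Merge new parameters into whatever query string is present.
        parsed = urlparse(url)
        qs = dict(parse_qsl(parsed.query))
        qs.update(query)
        return urlunparse(parsed._replace(query=urlencode(qs)))

    print(update_url_query('http://example.com/manifest.f4m?token=abc',
                           {'hdcore': '3.1.1', 'plugin': 'aasp-3.1.1.69.124'}))
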
index a9e3266dcb138794774e30ad2c0af0dea645463f..e0c5c18045312a064d8663a025a9fdaabb7a28df 100644 (file)
@@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor):
         }
 
 
-class ArteTVPlus7IE(InfoExtractor):
-    IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
-
+class ArteTVBaseIE(InfoExtractor):
     @classmethod
     def _extract_url_info(cls, url):
         mobj = re.match(cls._VALID_URL, url)
@@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor):
             video_id = mobj.group('id')
         return video_id, lang
 
-    def _real_extract(self, url):
-        video_id, lang = self._extract_url_info(url)
-        webpage = self._download_webpage(url, video_id)
-        return self._extract_from_webpage(webpage, video_id, lang)
-
-    def _extract_from_webpage(self, webpage, video_id, lang):
-        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
-        ids = (video_id, '')
-        # some pages contain multiple videos (like
-        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
-        # so we first try to look for json URLs that contain the video id from
-        # the 'vid' parameter.
-        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
-        json_url = self._html_search_regex(
-            patterns, webpage, 'json vp url', default=None)
-        if not json_url:
-            def find_iframe_url(webpage, default=NO_DEFAULT):
-                return self._html_search_regex(
-                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
-                    webpage, 'iframe url', group='url', default=default)
-
-            iframe_url = find_iframe_url(webpage, None)
-            if not iframe_url:
-                embed_url = self._html_search_regex(
-                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
-                if embed_url:
-                    player = self._download_json(
-                        embed_url, video_id, 'Downloading player page')
-                    iframe_url = find_iframe_url(player['html'])
-            # en and es URLs produce react-based pages with different layout (e.g.
-            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
-            if not iframe_url:
-                program = self._search_regex(
-                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
-                    webpage, 'program', default=None)
-                if program:
-                    embed_html = self._parse_json(program, video_id)
-                    if embed_html:
-                        iframe_url = find_iframe_url(embed_html['embed_html'])
-            if iframe_url:
-                json_url = compat_parse_qs(
-                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
-        if json_url:
-            title = self._search_regex(
-                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
-                webpage, 'title', default=None, group='title')
-            return self._extract_from_json_url(json_url, video_id, lang, title=title)
-        # Different kind of embed URL (e.g.
-        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
-        embed_url = self._search_regex(
-            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
-            webpage, 'embed url', group='url')
-        return self.url_result(embed_url)
-
     def _extract_from_json_url(self, json_url, video_id, lang, title=None):
         info = self._download_json(json_url, video_id)
         player_info = info['videoJsonPlayer']
@@ -161,24 +104,53 @@ class ArteTVPlus7IE(InfoExtractor):
             'es': 'E[ESP]',
         }
 
+        langcode = LANGS.get(lang, lang)
+
         formats = []
         for format_id, format_dict in player_info['VSR'].items():
             f = dict(format_dict)
             versionCode = f.get('versionCode')
-            langcode = LANGS.get(lang, lang)
-            lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
-            lang_pref = None
-            if versionCode:
-                matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)]
-                lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs)
-            source_pref = 0
-            if versionCode is not None:
-                # The original version with subtitles has lower relevance
-                if re.match(r'VO-ST(F|A|E)', versionCode):
-                    source_pref -= 10
-                # The version with sourds/mal subtitles has also lower relevance
-                elif re.match(r'VO?(F|A|E)-STM\1', versionCode):
-                    source_pref -= 9
+            l = re.escape(langcode)
+
+            # Language preference from most to least priority
+            # Reference: section 5.6.3 of
+            # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
+            PREFERENCES = (
+                # original version in requested language, without subtitles
+                r'VO{0}$'.format(l),
+                # original version in requested language, with partial subtitles in requested language
+                r'VO{0}-ST{0}$'.format(l),
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'VO{0}-STM{0}$'.format(l),
+                # non-original (dubbed) version in requested language, without subtitles
+                r'V{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with partial subtitles in requested language
+                r'V{0}-ST{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'V{0}-STM{0}$'.format(l),
+                # original version in requested language, with partial subtitles in different language
+                r'VO{0}-ST(?!{0}).+?$'.format(l),
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
+                r'VO{0}-STM(?!{0}).+?$'.format(l),
+                # original version in different language, with partial subtitles in requested language
+                r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
+                # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
+                # original version in different language, without subtitles
+                r'VO(?:(?!{0}))?$'.format(l),
+                # original version in different language, with partial subtitles in different language
+                r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
+                # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
+                r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
+            )
+
+            for pref, p in enumerate(PREFERENCES):
+                if re.match(p, versionCode):
+                    lang_pref = len(PREFERENCES) - pref
+                    break
+            else:
+                lang_pref = -1
+
             format = {
                 'format_id': format_id,
                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -188,7 +160,6 @@ class ArteTVPlus7IE(InfoExtractor):
                 'height': int_or_none(f.get('height')),
                 'tbr': int_or_none(f.get('bitrate')),
                 'quality': qfunc(f.get('quality')),
-                'source_preference': source_pref,
             }
 
             if f.get('mediaType') == 'rtmp':
@@ -207,28 +178,94 @@ class ArteTVPlus7IE(InfoExtractor):
         return info_dict
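
A minimal sketch of the ranking logic above, with a trimmed preference list and
invented versionCode values ('F' stands in for the LANGS-mapped French code;
the full PREFERENCES tuple lives in the extractor):

    import re

    def rank(version_code, langcode='F'):
        l = re.escape(langcode)
        preferences = (
            r'VO{0}$'.format(l),        # original in requested language, no subtitles
            r'VO{0}-ST{0}$'.format(l),  # original in requested language, subtitled
            r'V{0}$'.format(l),         # dubbed into requested language
        )
        for pref, p in enumerate(preferences):
            if re.match(p, version_code):
                # earlier entries in the tuple yield higher scores
                return len(preferences) - pref
        return -1  # no match ranks below every match

    assert rank('VOF') == 3      # best: original French, no subtitles
    assert rank('VF') == 1       # French dub ranks below the originals
    assert rank('VA-STA') == -1  # German dub with German subtitles: no match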
 
 
+class ArteTVPlus7IE(ArteTVBaseIE):
+    IE_NAME = 'arte.tv:+7'
+    _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
+        'only_matching': True,
+    }, {
+        'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id, lang = self._extract_url_info(url)
+        webpage = self._download_webpage(url, video_id)
+        return self._extract_from_webpage(webpage, video_id, lang)
+
+    def _extract_from_webpage(self, webpage, video_id, lang):
+        patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
+        ids = (video_id, '')
+        # some pages contain multiple videos (like
+        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
+        # so we first try to look for json URLs that contain the video id from
+        # the 'vid' parameter.
+        patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
+        json_url = self._html_search_regex(
+            patterns, webpage, 'json vp url', default=None)
+        if not json_url:
+            def find_iframe_url(webpage, default=NO_DEFAULT):
+                return self._html_search_regex(
+                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+                    webpage, 'iframe url', group='url', default=default)
+
+            iframe_url = find_iframe_url(webpage, None)
+            if not iframe_url:
+                embed_url = self._html_search_regex(
+                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
+                if embed_url:
+                    player = self._download_json(
+                        embed_url, video_id, 'Downloading player page')
+                    iframe_url = find_iframe_url(player['html'])
+            # en and es URLs produce react-based pages with different layout (e.g.
+            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
+            if not iframe_url:
+                program = self._search_regex(
+                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+                    webpage, 'program', default=None)
+                if program:
+                    embed_html = self._parse_json(program, video_id)
+                    if embed_html:
+                        iframe_url = find_iframe_url(embed_html['embed_html'])
+            if iframe_url:
+                json_url = compat_parse_qs(
+                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+        if json_url:
+            title = self._search_regex(
+                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+                webpage, 'title', default=None, group='title')
+            return self._extract_from_json_url(json_url, video_id, lang, title=title)
+        # Different kind of embed URL (e.g.
+        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+        entries = [
+            self.url_result(url)
+            for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)]
+        return self.playlist_result(entries)
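
The json_url recovery above pulls the value out of the iframe's query string; a
standalone sketch using the stdlib equivalents of the compat wrappers (sample
iframe URL invented):

    try:
        from urllib.parse import parse_qs, urlparse  # Python 3
    except ImportError:
        from urlparse import parse_qs, urlparse  # Python 2

    iframe_url = ('http://www.arte.tv/player/v2/index.php'
                  '?json_url=http%3A%2F%2Farte.tv%2Fpapi%2Fvideo%2F123%2FF.json')
    # parse_qs percent-decodes the value, so the nested URL comes out intact
    json_url = parse_qs(urlparse(iframe_url).query)['json_url'][0]
    assert json_url == 'http://arte.tv/papi/video/123/F.json'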
+
+
 # It also uses the arte_vp_url url from the webpage to extract the information
 class ArteTVCreativeIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:creative'
     _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 
     _TESTS = [{
-        'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
+        'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1',
         'info_dict': {
-            'id': '72176',
+            'id': '057405-001-A',
             'ext': 'mp4',
-            'title': 'Folge 2 - Corporate Design',
-            'upload_date': '20131004',
+            'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)',
+            'upload_date': '20150716',
         },
     }, {
         'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion',
-        'info_dict': {
-            'id': '160676',
-            'ext': 'mp4',
-            'title': 'Monty Python live (mostly)',
-            'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n',
-            'upload_date': '20140805',
-        }
+        'playlist_count': 11,
+        'add_ie': ['Youtube'],
     }, {
         'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde',
         'only_matching': True,
@@ -239,7 +276,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:info'
     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
         'info_dict': {
             'id': '067528-000-A',
@@ -247,7 +284,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
             'title': 'Service civique, un cache misère ?',
             'upload_date': '20160403',
         },
-    }
+    }]
 
 
 class ArteTVFutureIE(ArteTVPlus7IE):
@@ -272,6 +309,8 @@ class ArteTVDDCIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:ddc'
     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
 
+    _TESTS = []
+
     def _real_extract(self, url):
         video_id, lang = self._extract_url_info(url)
         if lang == 'folge':
@@ -290,7 +329,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:concert'
     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
         'md5': '9ea035b7bd69696b67aa2ccaaa218161',
         'info_dict': {
@@ -300,24 +339,23 @@ class ArteTVConcertIE(ArteTVPlus7IE):
             'upload_date': '20140128',
             'description': 'md5:486eb08f991552ade77439fe6d82c305',
         },
-    }
+    }]
 
 
 class ArteTVCinemaIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:cinema'
     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
 
-    _TEST = {
-        'url': 'http://cinema.arte.tv/de/node/38291',
-        'md5': '6b275511a5107c60bacbeeda368c3aa1',
+    _TESTS = [{
+        'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck',
+        'md5': 'a5b9dd5575a11d93daf0e3f404f45438',
         'info_dict': {
-            'id': '055876-000_PWA12025-D',
+            'id': '062494-000-A',
             'ext': 'mp4',
-            'title': 'Tod auf dem Nil',
-            'upload_date': '20160122',
-            'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
+            'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck',
+            'upload_date': '20150807',
         },
-    }
+    }]
 
 
 class ArteTVMagazineIE(ArteTVPlus7IE):
@@ -362,9 +400,42 @@ class ArteTVEmbedIE(ArteTVPlus7IE):
         )
     '''
 
+    _TESTS = []
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         lang = mobj.group('lang')
         json_url = mobj.group('json_url')
         return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVPlaylistIE(ArteTVBaseIE):
+    IE_NAME = 'arte.tv:playlist'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
+        'info_dict': {
+            'id': 'PL-013263',
+            'title': 'Areva & Uramin',
+            'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',
+        },
+        'playlist_mincount': 6,
+    }, {
+        'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id, lang = self._extract_url_info(url)
+        collection = self._download_json(
+            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
+            % (lang, playlist_id), playlist_id)
+        title = collection.get('title')
+        description = collection.get('shortDescription') or collection.get('teaserText')
+        entries = [
+            self._extract_from_json_url(
+                video['jsonUrl'], video.get('programId') or playlist_id, lang)
+            for video in collection['videos'] if video.get('jsonUrl')]
+        return self.playlist_result(entries, playlist_id, title, description)
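
A sketch of the entry filtering above, with a hand-written payload standing in
for the real collectionData response (field names from the code, values
invented): videos without a 'jsonUrl' are silently dropped.

    collection = {
        'title': 'Example collection',
        'videos': [
            {'jsonUrl': 'http://example.invalid/1.json', 'programId': '1-A'},
            {'programId': '2-A'},  # no jsonUrl, so this entry is skipped
        ],
    }
    playable = [v for v in collection['videos'] if v.get('jsonUrl')]
    assert [v['programId'] for v in playable] == ['1-A']
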
index a52d26cecd1e98f8d4a902ed4e8051a42e21e200..f3bd4d4447f559a8bd924f7d796a1a9faf24b9d3 100644 (file)
@@ -6,6 +6,7 @@ import time
 
 from .common import InfoExtractor
 from .soundcloud import SoundcloudIE
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     url_basename,
@@ -136,7 +137,7 @@ class AudiomackAlbumIE(InfoExtractor):
                         result[resultkey] = api_response[apikey]
                 song_id = url_basename(api_response['url']).rpartition('.')[0]
                 result['entries'].append({
-                    'id': api_response.get('id', song_id),
+                    'id': compat_str(api_response.get('id', song_id)),
                     'uploader': api_response.get('artist'),
                     'title': api_response.get('title', song_id),
                     'url': api_response['url'],
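
The compat_str wrapper added above matters because the Audiomack API may return
the id as a number, while youtube-dl expects info-dict ids to be strings;
compat_str is the Python 2/3-safe str. A tiny illustration with an invented
api_response:

    api_response = {'id': 123456, 'url': 'http://example.invalid/song.mp3'}
    song_id = 'song-basename'  # fallback derived from the URL
    entry_id = str(api_response.get('id', song_id))  # compat_str in the extractor
    assert entry_id == '123456'
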
index efa624de1cbfddb741a7f8114059165d4e099095..a813eb429fe8168c2e4223342fd6540647fe127a 100644 (file)
@@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor):
                 'uploader_id': 272749,
                 'view_count': int,
             },
+            'skip': 'Channel offline',
         },
     ]
 
@@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor):
             'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
 
         title = data['title'].strip()
-        description = data['description']
-        thumbnail = data['thumbnail']
-        view_count = data['view_count']
-        uploader = data['user']['username']
-        uploader_id = data['user']['id']
+        description = data.get('description')
+        thumbnail = data.get('thumbnail')
+        view_count = data.get('view_count')
+        user = data.get('user', {})
+        uploader = user.get('username')
+        uploader_id = user.get('id')
 
         stream_params = json.loads(data['stream_params'])
 
-        timestamp = float_or_none(stream_params['creationDate'], 1000)
-        duration = float_or_none(stream_params['length'], 1000)
+        timestamp = float_or_none(stream_params.get('creationDate'), 1000)
+        duration = float_or_none(stream_params.get('length'), 1000)
 
         renditions = stream_params.get('renditions') or []
         video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
         if video:
             renditions.append(video)
 
+        if not renditions and not user.get('channel', {}).get('is_live', True):
+            raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
+
         formats = [{
             'url': fmt['url'],
             'width': fmt['frameWidth'],
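
The offline check above chains .get() calls with safe defaults: is_live
defaults to True, so the error is raised only when no renditions were found
and the API explicitly marks the channel offline. A standalone sketch with an
invented payload:

    user = {'channel': {'is_live': False}}
    renditions = []
    if not renditions and not user.get('channel', {}).get('is_live', True):
        print('channel is offline')  # the extractor raises an expected ExtractorError here
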
index c1ef8051d3074a6551941bf140f88eee4ed8a124..991ab0676e6b93a1c64d04f48b3728551bc4ccf0 100644 (file)
@@ -29,7 +29,7 @@ class BandcampIE(InfoExtractor):
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
-        'md5': '2b68e5851514c20efdff2afc5603b8b4',
+        'md5': '73d0b3171568232574e45652f8720b5c',
         'info_dict': {
             'id': '2650410135',
             'ext': 'mp3',
@@ -48,6 +48,10 @@ class BandcampIE(InfoExtractor):
             if m_trackinfo:
                 json_code = m_trackinfo.group(1)
                 data = json.loads(json_code)[0]
+                track_id = compat_str(data['id'])
+
+                if not data.get('file'):
+                    raise ExtractorError('Not streamable', video_id=track_id, expected=True)
 
                 formats = []
                 for format_id, format_url in data['file'].items():
@@ -64,7 +68,7 @@ class BandcampIE(InfoExtractor):
                 self._sort_formats(formats)
 
                 return {
-                    'id': compat_str(data['id']),
+                    'id': track_id,
                     'title': data['title'],
                     'formats': formats,
                     'duration': float_or_none(data.get('duration')),
index 74c4510f9b4522b0a914cdf1621bff832ac94638..4b3cd8c65a65967c5ad017d53126e9ccc76a71ef 100644 (file)
@@ -31,7 +31,7 @@ class BBCCoUkIE(InfoExtractor):
                             music/clips[/#]|
                             radio/player/
                         )
-                        (?P<id>%s)
+                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
                     ''' % _ID_REGEX
 
     _MEDIASELECTOR_URLS = [
@@ -192,6 +192,7 @@ class BBCCoUkIE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'Now it\'s really geo-restricted',
         }, {
             # compact player (https://github.com/rg3/youtube-dl/issues/8147)
             'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
@@ -698,7 +699,9 @@ class BBCIE(BBCCoUkIE):
 
     @classmethod
     def suitable(cls, url):
-        return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url)
+        EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+        return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
+                else super(BBCIE, cls).suitable(url))
 
     def _extract_from_media_meta(self, media_meta, video_id):
         # Direct links to media in media metadata (e.g.
@@ -975,3 +978,72 @@ class BBCCoUkArticleIE(InfoExtractor):
             r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
 
         return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkPlaylistBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
+            for video_id in re.findall(
+                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)]
+
+        title, description = self._extract_title_and_description(webpage)
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:iplayer:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
+    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+    _TEST = {
+        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 6,
+    }
+
+    def _extract_title_and_description(self, webpage):
+        title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
+        description = self._search_regex(
+            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
+            webpage, 'description', fatal=False, group='value')
+        return title, description
+
+
+class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
+    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
+    _TESTS = [{
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance - Clips - BBC Four',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 7,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
+        'only_matching': True,
+    }]
+
+    def _extract_title_and_description(self, webpage):
+        title = self._og_search_title(webpage, fatal=False)
+        description = self._og_search_description(webpage)
+        return title, description
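
BBCCoUkPlaylistBaseIE above is a small template-method pattern: subclasses only
supply _URL_TEMPLATE, _VIDEO_ID_TEMPLATE and _extract_title_and_description,
and the base class does the scraping. A toy version of the same shape (names
invented, id regex simplified):

    import re

    class PlaylistBase(object):
        def extract_urls(self, webpage):
            return [self.URL_TEMPLATE % video_id
                    for video_id in re.findall(self.VIDEO_ID_RE, webpage)]

    class ClipsPlaylist(PlaylistBase):
        URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
        VIDEO_ID_RE = r'data-pid=["\']([a-z\d]{8})'

    page = '<a data-pid="b05rcz9v">clip</a>'
    assert ClipsPlaylist().extract_urls(page) == [
        'http://www.bbc.co.uk/programmes/b05rcz9v']
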
index 986245bf0568e8aaaaab8b8a32eeedca866b21cc..bd3ee2e2eb3822253bb04fceb253d4028448f5f5 100644 (file)
@@ -1,31 +1,27 @@
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-from ..utils import (
-    xpath_text,
-    xpath_with_ns,
-    int_or_none,
-    parse_iso8601,
-)
+from .mtv import MTVServicesInfoExtractor
+from ..utils import unified_strdate
+from ..compat import compat_urllib_parse_urlencode
 
 
-class BetIE(InfoExtractor):
+class BetIE(MTVServicesInfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
     _TESTS = [
         {
             'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
             'info_dict': {
-                'id': 'news/national/2014/a-conversation-with-president-obama',
+                'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
                 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
                 'ext': 'flv',
                 'title': 'A Conversation With President Obama',
-                'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
+                'description': 'President Obama urges persistence in confronting racism and bias.',
                 'duration': 1534,
-                'timestamp': 1418075340,
                 'upload_date': '20141208',
-                'uploader': 'admin',
                 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+                'subtitles': {
+                    'en': 'mincount:2',
+                }
             },
             'params': {
                 # rtmp download
@@ -35,16 +31,17 @@ class BetIE(InfoExtractor):
         {
             'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
             'info_dict': {
-                'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
+                'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
                 'display_id': 'justice-for-ferguson-a-community-reacts',
                 'ext': 'flv',
                 'title': 'Justice for Ferguson: A Community Reacts',
                 'description': 'A BET News special.',
                 'duration': 1696,
-                'timestamp': 1416942360,
                 'upload_date': '20141125',
-                'uploader': 'admin',
                 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+                'subtitles': {
+                    'en': 'mincount:2',
+                }
             },
             'params': {
                 # rtmp download
@@ -53,57 +50,32 @@ class BetIE(InfoExtractor):
         }
     ]
 
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        media_url = compat_urllib_parse_unquote(self._search_regex(
-            [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
-            webpage, 'media URL'))
+    _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
 
-        video_id = self._search_regex(
-            r'/video/(.*)/_jcr_content/', media_url, 'video id')
+    def _get_feed_query(self, uri):
+        return compat_urllib_parse_urlencode({
+            'uuid': uri,
+        })
 
-        mrss = self._download_xml(media_url, display_id)
-
-        item = mrss.find('./channel/item')
-
-        NS_MAP = {
-            'dc': 'http://purl.org/dc/elements/1.1/',
-            'media': 'http://search.yahoo.com/mrss/',
-            'ka': 'http://kickapps.com/karss',
-        }
+    def _extract_mgid(self, webpage):
+        return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')
 
-        title = xpath_text(item, './title', 'title')
-        description = xpath_text(
-            item, './description', 'description', fatal=False)
-
-        timestamp = parse_iso8601(xpath_text(
-            item, xpath_with_ns('./dc:date', NS_MAP),
-            'upload date', fatal=False))
-        uploader = xpath_text(
-            item, xpath_with_ns('./dc:creator', NS_MAP),
-            'uploader', fatal=False)
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
 
-        media_content = item.find(
-            xpath_with_ns('./media:content', NS_MAP))
-        duration = int_or_none(media_content.get('duration'))
-        smil_url = media_content.get('url')
+        webpage = self._download_webpage(url, display_id)
+        mgid = self._extract_mgid(webpage)
+        videos_info = self._get_videos_info(mgid)
 
-        thumbnail = media_content.find(
-            xpath_with_ns('./media:thumbnail', NS_MAP)).get('url')
+        info_dict = videos_info['entries'][0]
 
-        formats = self._extract_smil_formats(smil_url, display_id)
-        self._sort_formats(formats)
+        upload_date = unified_strdate(self._html_search_meta('date', webpage))
+        description = self._html_search_meta('description', webpage)
 
-        return {
-            'id': video_id,
+        info_dict.update({
             'display_id': display_id,
-            'title': title,
             'description': description,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'uploader': uploader,
-            'duration': duration,
-            'formats': formats,
-        }
+            'upload_date': upload_date,
+        })
+
+        return info_dict
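
After the rewrite above, BetIE only tells the shared MTVServices code how to
query the feed (by uuid) and where to find the mgid on the page; everything
else is inherited. _get_feed_query boils down to a URL-encoding call:

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2

    # uuid taken from the first test case above
    assert (urlencode({'uuid': '07e96bd3-8850-3051-b856-271b457f0ab8'}) ==
            'uuid=07e96bd3-8850-3051-b856-271b457f0ab8')
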
index 8baff2041bb380d0204895cbbc6c64b16be94993..b17047b399b6630fe2334aa24f0a5e97aed8506f 100644 (file)
@@ -1,34 +1,42 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import calendar
+import datetime
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_etree_fromstring,
+    compat_str,
+    compat_parse_qs,
+    compat_xml_parse_error,
+)
 from ..utils import (
-    int_or_none,
-    unescapeHTML,
     ExtractorError,
+    int_or_none,
+    float_or_none,
     xpath_text,
 )
 
 
 class BiliBiliIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?'
+    _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
-        'md5': '2c301e4dab317596e837c3e7633e7d86',
+        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
         'info_dict': {
             'id': '1554319',
             'ext': 'flv',
             'title': '【金坷垃】金泡沫',
-            'duration': 308313,
+            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
+            'duration': 308.067,
+            'timestamp': 1398012660,
             'upload_date': '20140420',
             'thumbnail': 're:^https?://.+\.jpg',
-            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'timestamp': 1397983878,
             'uploader': '菊子桑',
+            'uploader_id': '156160',
         },
     }, {
         'url': 'http://www.bilibili.com/video/av1041170/',
@@ -36,75 +44,186 @@ class BiliBiliIE(InfoExtractor):
             'id': '1041170',
             'title': '【BD1080P】刀语【诸神&异域】',
             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
-            'uploader': '枫叶逝去',
-            'timestamp': 1396501299,
         },
         'playlist_count': 9,
+    }, {
+        'url': 'http://www.bilibili.com/video/av4808130/',
+        'info_dict': {
+            'id': '4808130',
+            'title': '【长篇】哆啦A梦443【钉铛】',
+            'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+        },
+        'playlist': [{
+            'md5': '55cdadedf3254caaa0d5d27cf20a8f9c',
+            'info_dict': {
+                'id': '4808130_part1',
+                'ext': 'flv',
+                'title': '【长篇】哆啦A梦443【钉铛】',
+                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+                'timestamp': 1464564180,
+                'upload_date': '20160529',
+                'uploader': '喜欢拉面',
+                'uploader_id': '151066',
+            },
+        }, {
+            'md5': '926f9f67d0c482091872fbd8eca7ea3d',
+            'info_dict': {
+                'id': '4808130_part2',
+                'ext': 'flv',
+                'title': '【长篇】哆啦A梦443【钉铛】',
+                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+                'timestamp': 1464564180,
+                'upload_date': '20160529',
+                'uploader': '喜欢拉面',
+                'uploader_id': '151066',
+            },
+        }, {
+            'md5': '4b7b225b968402d7c32348c646f1fd83',
+            'info_dict': {
+                'id': '4808130_part3',
+                'ext': 'flv',
+                'title': '【长篇】哆啦A梦443【钉铛】',
+                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+                'timestamp': 1464564180,
+                'upload_date': '20160529',
+                'uploader': '喜欢拉面',
+                'uploader_id': '151066',
+            },
+        }, {
+            'md5': '7b795e214166501e9141139eea236e91',
+            'info_dict': {
+                'id': '4808130_part4',
+                'ext': 'flv',
+                'title': '【长篇】哆啦A梦443【钉铛】',
+                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+                'timestamp': 1464564180,
+                'upload_date': '20160529',
+                'uploader': '喜欢拉面',
+                'uploader_id': '151066',
+            },
+        }],
+    }, {
+        # Missing upload time
+        'url': 'http://www.bilibili.com/video/av1867637/',
+        'info_dict': {
+            'id': '2880301',
+            'ext': 'flv',
+            'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】',
+            'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
+            'uploader': '黑夜为猫',
+            'uploader_id': '610729',
+        },
+        'params': {
+            # Just to test metadata extraction
+            'skip_download': True,
+        },
+        'expected_warnings': ['upload time'],
     }]
 
+    # BiliBili blocks keys from time to time. The current key is extracted from
+    # the Android client
+    # TODO: find the sign algorithm used in the flash player
+    _APP_KEY = '86385cdc024c0f6c'
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        page_num = mobj.group('page_num') or '1'
 
-        view_data = self._download_json(
-            'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
-            video_id)
-        if 'error' in view_data:
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)
+        webpage = self._download_webpage(url, video_id)
 
-        cid = view_data['cid']
-        title = unescapeHTML(view_data['title'])
+        params = compat_parse_qs(self._search_regex(
+            [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+             r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+            webpage, 'player parameters'))
+        cid = params['cid'][0]
 
-        doc = self._download_xml(
-            'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
-            cid,
-            'Downloading page %s/%s' % (page_num, view_data['pages'])
-        )
+        info_xml_str = self._download_webpage(
+            'http://interface.bilibili.com/v_cdn_play',
+            cid, query={'appkey': self._APP_KEY, 'cid': cid},
+            note='Downloading video info page')
+
+        err_msg = None
+        durls = None
+        info_xml = None
+        try:
+            info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8'))
+        except compat_xml_parse_error:
+            info_json = self._parse_json(info_xml_str, video_id, fatal=False)
+            err_msg = (info_json or {}).get('error_text')
+        else:
+            err_msg = xpath_text(info_xml, './message')
 
-        if xpath_text(doc, './result') == 'error':
-            raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True)
+        if info_xml is not None:
+            durls = info_xml.findall('./durl')
+        if not durls:
+            if err_msg:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True)
+            else:
+                raise ExtractorError('No videos found!')
 
         entries = []
 
-        for durl in doc.findall('./durl'):
+        for durl in durls:
             size = xpath_text(durl, ['./filesize', './size'])
             formats = [{
                 'url': durl.find('./url').text,
                 'filesize': int_or_none(size),
-                'ext': 'flv',
             }]
-            backup_urls = durl.find('./backup_url')
-            if backup_urls is not None:
-                for backup_url in backup_urls.findall('./url'):
-                    formats.append({'url': backup_url.text})
-            formats.reverse()
+            for backup_url in durl.findall('./backup_url/url'):
+                formats.append({
+                    'url': backup_url.text,
+                    # backup URLs have lower priority
+                    'preference': -2 if 'hd.mp4' in backup_url.text else -3,
+                })
+
+            self._sort_formats(formats)
 
             entries.append({
                 'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
-                'title': title,
                 'duration': int_or_none(xpath_text(durl, './length'), 1000),
                 'formats': formats,
             })
 
+        title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
+        description = self._html_search_meta('description', webpage)
+        datetime_str = self._html_search_regex(
+            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)
+        timestamp = None
+        if datetime_str:
+            timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple())
+
+        # TODO 'view_count' requires deobfuscating Javascript
         info = {
             'id': compat_str(cid),
             'title': title,
-            'description': view_data.get('description'),
-            'thumbnail': view_data.get('pic'),
-            'uploader': view_data.get('author'),
-            'timestamp': int_or_none(view_data.get('created')),
-            'view_count': int_or_none(view_data.get('play')),
-            'duration': int_or_none(xpath_text(doc, './timelength')),
+            'description': description,
+            'timestamp': timestamp,
+            'thumbnail': self._html_search_meta('thumbnailUrl', webpage),
+            'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000),
         }
 
+        uploader_mobj = re.search(
+            r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
+            webpage)
+        if uploader_mobj:
+            info.update({
+                'uploader': uploader_mobj.group('name'),
+                'uploader_id': uploader_mobj.group('id'),
+            })
+
+        for entry in entries:
+            entry.update(info)
+
         if len(entries) == 1:
-            entries[0].update(info)
             return entries[0]
         else:
-            info.update({
+            for idx, entry in enumerate(entries):
+                entry['id'] = '%s_part%d' % (video_id, (idx + 1))
+
+            return {
                 '_type': 'multi_video',
                 'id': video_id,
+                'title': title,
+                'description': description,
                 'entries': entries,
-            })
-            return info
+            }
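
Two details of the rewrite above are worth spelling out. The info endpoint may
answer with XML (success or error) or with JSON (some error responses), hence
the compat_etree_fromstring attempt with a JSON fallback. And the page's
<time datetime="..."> attribute carries no timezone, so it is read as UTC via
calendar.timegm; a sketch of that conversion with an invented value:

    import calendar
    import datetime

    datetime_str = '2016-05-29T22:43'  # sample value in the page's format
    timestamp = calendar.timegm(
        datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple())
    assert timestamp == 1464561780  # timegm treats the tuple as UTC
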
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py
new file mode 100644 (file)
index 0000000..ae4579b
--- /dev/null
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BIQLEIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
+    _TESTS = [{
+        'url': 'http://www.biqle.ru/watch/847655_160197695',
+        'md5': 'ad5f746a874ccded7b8f211aeea96637',
+        'info_dict': {
+            'id': '160197695',
+            'ext': 'mp4',
+            'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)',
+            'uploader': 'Andrey Rogozin',
+            'upload_date': '20110605',
+        }
+    }, {
+        'url': 'https://biqle.org/watch/-44781847_168547604',
+        'md5': '7f24e72af1db0edf7c1aaba513174f97',
+        'info_dict': {
+            'id': '168547604',
+            'ext': 'mp4',
+            'title': 'Ребенок в шоке от автоматической мойки',
+            'uploader': 'Dmitry Kotov',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        embed_url = self._proto_relative_url(self._search_regex(
+            r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': embed_url,
+        }
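
The '_type': 'url_transparent' result above delegates extraction to whichever
extractor matches the daxab.com embed, while any fields set alongside the URL
would override what the delegate returns. Schematically (embed URL invented):

    def extract_sketch(embed_url):
        return {
            '_type': 'url_transparent',
            'url': embed_url,
            # metadata added here takes precedence over the fields the
            # delegated extractor finds at embed_url
        }

    result = extract_sketch('//daxab.com/player/example')
    assert result['_type'] == 'url_transparent'
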
index 13343bc258532b37bf912f0648e317103b5f428d..bd538be50bc4a650d000eaffb5e292a4e8e76cbe 100644 (file)
@@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor):
             'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
             'description': 'md5:a8ba0302912d03d246979735c17d2761',
         },
+        'params': {
+            'format': 'best[format_id^=hds]',
+        },
     }, {
         'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
         'only_matching': True,
index 11cf498515ba572f8ef8c7f20d5620bf50289827..ff0aa11b19a7736017992d76f13a0ba5509f2f8e 100644 (file)
@@ -29,7 +29,8 @@ class BRIE(InfoExtractor):
                 'duration': 180,
                 'uploader': 'Reinhard Weber',
                 'upload_date': '20150422',
-            }
+            },
+            'skip': '404 not found',
         },
         {
             'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
@@ -40,7 +41,8 @@ class BRIE(InfoExtractor):
                 'title': 'Manfred Schreiber ist tot',
                 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
                 'duration': 26,
-            }
+            },
+            'skip': '404 not found',
         },
         {
             'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
@@ -51,7 +53,8 @@ class BRIE(InfoExtractor):
                 'title': 'Kurzweilig und sehr bewegend',
                 'description': 'md5:0351996e3283d64adeb38ede91fac54e',
                 'duration': 296,
-            }
+            },
+            'skip': '404 not found',
         },
         {
             'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
index f0781fc273a18ec30c1ffa97546232d991ad8574..57ce0c174ce92c45a80471e8088b2a951c0a0219 100644 (file)
@@ -90,6 +90,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                 'description': 'md5:363109c02998fee92ec02211bd8000df',
                 'uploader': 'National Ballet of Canada',
             },
+            'skip': 'Video gone',
         },
         {
             # test flv videos served by akamaihd.net
@@ -108,7 +109,7 @@ class BrightcoveLegacyIE(InfoExtractor):
             },
         },
         {
-            # playlist test
+            # playlist with 'videoList'
             # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
             'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
             'info_dict': {
@@ -117,6 +118,15 @@ class BrightcoveLegacyIE(InfoExtractor):
             },
             'playlist_mincount': 7,
         },
+        {
+            # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965)
+            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
+            'info_dict': {
+                'id': '1522758701001',
+                'title': 'Lesson 08',
+            },
+            'playlist_mincount': 10,
+        },
     ]
     FLV_VCODECS = {
         1: 'SORENSON',
@@ -298,18 +308,25 @@ class BrightcoveLegacyIE(InfoExtractor):
             info_url, player_key, 'Downloading playlist information')
 
         json_data = json.loads(playlist_info)
-        if 'videoList' not in json_data:
+        if 'videoList' in json_data:
+            playlist_info = json_data['videoList']
+            playlist_dto = playlist_info['mediaCollectionDTO']
+        elif 'playlistTabs' in json_data:
+            playlist_info = json_data['playlistTabs']
+            playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0]
+        else:
             raise ExtractorError('Empty playlist')
-        playlist_info = json_data['videoList']
-        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+        videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']]
 
         return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
-                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+                                    playlist_title=playlist_dto['displayName'])
 
     def _extract_video_info(self, video_info):
+        video_id = compat_str(video_info['id'])
         publisher_id = video_info.get('publisherId')
         info = {
-            'id': compat_str(video_info['id']),
+            'id': video_id,
             'title': video_info['displayName'].strip(),
             'description': video_info.get('shortDescription'),
             'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
@@ -331,7 +348,8 @@ class BrightcoveLegacyIE(InfoExtractor):
                     url_comp = compat_urllib_parse_urlparse(url)
                     if url_comp.path.endswith('.m3u8'):
                         formats.extend(
-                            self._extract_m3u8_formats(url, info['id'], 'mp4'))
+                            self._extract_m3u8_formats(
+                                url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
                         continue
                     elif 'akamaihd.net' in url_comp.netloc:
                         # This type of renditions are served through
@@ -365,7 +383,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                     a_format.update({
                         'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''),
                         'ext': 'mp4',
-                        'protocol': 'm3u8',
+                        'protocol': 'm3u8_native',
                     })
 
                 formats.append(a_format)
@@ -395,7 +413,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                     return ad_info
 
         if 'url' not in info and not info.get('formats'):
-            raise ExtractorError('Unable to extract video url for %s' % info['id'])
+            raise ExtractorError('Unable to extract video url for %s' % video_id)
         return info
 
 
@@ -442,6 +460,10 @@ class BrightcoveNewIE(InfoExtractor):
         # non numeric ref: prefixed video id
         'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
         'only_matching': True,
+    }, {
+        # unavailable video without message but with error_code
+        'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -512,8 +534,9 @@ class BrightcoveNewIE(InfoExtractor):
             })
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                json_data = self._parse_json(e.cause.read().decode(), video_id)
-                raise ExtractorError(json_data[0]['message'], expected=True)
+                json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+                raise ExtractorError(
+                    json_data.get('message') or json_data['error_code'], expected=True)
             raise
 
         title = json_data['name'].strip()
@@ -527,7 +550,7 @@ class BrightcoveNewIE(InfoExtractor):
                 if not src:
                     continue
                 formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
             elif source_type == 'application/dash+xml':
                 if not src:
                     continue
@@ -578,6 +601,13 @@ class BrightcoveNewIE(InfoExtractor):
                         'format_id': build_format_id('rtmp'),
                     })
                 formats.append(f)
+
+        errors = json_data.get('errors')
+        if not formats and errors:
+            error = errors[0]
+            raise ExtractorError(
+                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+
         self._sort_formats(formats)
 
         subtitles = {}
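
The playlist change above makes the two legacy payload shapes explicit: the
JSON carries either a 'videoList' key or, for tabbed experiences, a
'playlistTabs' key, and both paths end in a DTO holding 'videoDTOs' and
'displayName'. A sketch with an invented payload (ValueError standing in for
ExtractorError):

    def pick_playlist_dto(json_data):
        if 'videoList' in json_data:
            return json_data['videoList']['mediaCollectionDTO']
        elif 'playlistTabs' in json_data:
            return json_data['playlistTabs']['lineupListDTO']['playlistDTOs'][0]
        raise ValueError('Empty playlist')

    dto = pick_playlist_dto({'playlistTabs': {'lineupListDTO': {
        'playlistDTOs': [{'displayName': 'Lesson 08', 'videoDTOs': []}]}}})
    assert dto['displayName'] == 'Lesson 08'
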
index df503ecc0f50283f0cc77a867353912a47eee5dd..75fa92d7cfc0204f4539e10c762585b2537abbb6 100644 (file)
@@ -5,6 +5,7 @@ import json
 import re
 
 from .common import InfoExtractor
+from .facebook import FacebookIE
 
 
 class BuzzFeedIE(InfoExtractor):
@@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor):
             'info_dict': {
                 'id': 'aVCR29aE_OQ',
                 'ext': 'mp4',
+                'title': 'Angry Ram destroys a punching bag..',
+                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
                 'upload_date': '20141024',
                 'uploader_id': 'Buddhanz1',
-                'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl',
-                'uploader': 'Buddhanz',
-                'title': 'Angry Ram destroys a punching bag',
+                'uploader': 'Angry Ram',
             }
         }]
     }, {
@@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor):
             'info_dict': {
                 'id': 'mVmBL8B-In0',
                 'ext': 'mp4',
+                'title': 're:Munchkin the Teddy Bear gets her exercise',
+                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
                 'upload_date': '20141124',
                 'uploader_id': 'CindysMunchkin',
-                'description': 're:© 2014 Munchkin the',
                 'uploader': 're:^Munchkin the',
-                'title': 're:Munchkin the Teddy Bear gets her exercise',
             },
         }]
+    }, {
+        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
+        'info_dict': {
+            'id': 'the-most-adorable-crash-landing-ever',
+            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
+            'description': 'This gosling knows how to stick a landing.',
+        },
+        'playlist': [{
+            'md5': '763ca415512f91ca62e4621086900a23',
+            'info_dict': {
+                'id': '971793786185728',
+                'ext': 'mp4',
+                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
+                'uploader': 'Calgary Outdoor Centre-University of Calgary',
+            },
+        }],
+        'add_ie': ['Facebook'],
     }]
 
     def _real_extract(self, url):
@@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor):
                 continue
             entries.append(self.url_result(video['url']))
 
+        facebook_url = FacebookIE._extract_url(webpage)
+        if facebook_url:
+            entries.append(self.url_result(facebook_url))
+
         return {
             '_type': 'playlist',
             'id': playlist_id,
index dda98059e9041c651de5a211fccb2c106b11bb75..3aec601f8e7179570088e1ea5ad1f7b6d30f219d 100644 (file)
@@ -11,6 +11,7 @@ class BYUtvIE(InfoExtractor):
     _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
     _TEST = {
         'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
+        'md5': '05850eb8c749e2ee05ad5a1c34668493',
         'info_dict': {
             'id': 'studio-c-season-5-episode-5',
             'ext': 'mp4',
@@ -21,7 +22,8 @@ class BYUtvIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'add_ie': ['Ooyala'],
     }
 
     def _real_extract(self, url):
index 25b2d4efe5d54e1c3264f906a3105ad05dd2ca3f..61463f249f6e4ded3b5f59831d7dba421ef9de9a 100644 (file)
@@ -4,11 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     ExtractorError,
     HEADRequest,
     unified_strdate,
-    url_basename,
     qualities,
     int_or_none,
 )
@@ -16,24 +16,38 @@ from ..utils import (
 
 class CanalplusIE(InfoExtractor):
     IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv'
-    _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:
+                                    (?:(?:www|m)\.)?canalplus\.fr|
+                                    (?:www\.)?piwiplus\.fr|
+                                    (?:www\.)?d8\.tv|
+                                    (?:www\.)?d17\.tv|
+                                    (?:www\.)?itele\.fr
+                                )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?|
+                                player\.canalplus\.fr/#/(?P<id>\d+)
+                            )
+
+                    '''
     _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
     _SITE_ID_MAP = {
-        'canalplus.fr': 'cplus',
-        'piwiplus.fr': 'teletoon',
-        'd8.tv': 'd8',
-        'itele.fr': 'itele',
+        'canalplus': 'cplus',
+        'piwiplus': 'teletoon',
+        'd8': 'd8',
+        'd17': 'd17',
+        'itele': 'itele',
     }
 
     _TESTS = [{
-        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
-        'md5': '12164a6f14ff6df8bd628e8ba9b10b78',
+        'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
+        'md5': '41f438a4904f7664b91b4ed0dec969dc',
         'info_dict': {
-            'id': '1263092',
+            'id': '1192814',
             'ext': 'mp4',
-            'title': 'Le Zapping - 13/05/15',
-            'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
-            'upload_date': '20150513',
+            'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014",
+            'description': "Toute l'année 2014 dans un Zapping exceptionnel !",
+            'upload_date': '20150105',
         },
     }, {
         'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -46,35 +60,45 @@ class CanalplusIE(InfoExtractor):
         },
         'skip': 'Only works from France',
     }, {
-        'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+        'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',
         'info_dict': {
-            'id': '966289',
-            'ext': 'flv',
-            'title': 'Campagne intime - Documentaire exceptionnel',
-            'description': 'md5:d2643b799fb190846ae09c61e59a859f',
-            'upload_date': '20131108',
+            'id': '1390231',
+            'ext': 'mp4',
+            'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité",
+            'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6',
+            'upload_date': '20160512',
+        },
+        'params': {
+            'skip_download': True,
         },
-        'skip': 'videos get deleted after a while',
     }, {
-        'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
-        'md5': '38b8f7934def74f0d6f3ba6c036a5f82',
+        'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',
         'info_dict': {
-            'id': '1213714',
+            'id': '1398334',
             'ext': 'mp4',
-            'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45',
-            'description': 'md5:8216206ec53426ea6321321f3b3c16db',
-            'upload_date': '20150211',
+            'title': "L'invité de Bruce Toussaint du 07/06/2016 - ",
+            'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324',
+            'upload_date': '20160607',
         },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://m.canalplus.fr/?vid=1398231',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.groupdict().get('id')
+        video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid')
 
-        site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal']
+        site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]
 
         # Beware, some subclasses do not define an id group
-        display_id = url_basename(mobj.group('path'))
+        display_id = mobj.group('display_id') or video_id
 
         if video_id is None:
             webpage = self._download_webpage(url, display_id)
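
The site_id lookup above now keys off the URL host instead of a regex group:
rsplit('.', 2) keeps the last three dot-separated labels and [-2] picks the
second-level domain, so every subdomain of a site maps to the same key. With
two URLs from the tests above:

    try:
        from urllib.parse import urlparse  # Python 3
    except ImportError:
        from urlparse import urlparse  # Python 2

    for url in ('http://m.canalplus.fr/?vid=1398231',
                'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061'):
        print(urlparse(url).netloc.rsplit('.', 2)[-2])
    # -> canalplus, d17: keys of _SITE_ID_MAP
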
diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py
new file mode 100644 (file)
index 0000000..5797fb9
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    try_get,
+)
+
+
+class CarambaTVIE(InfoExtractor):
+    _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://video1.carambatv.ru/v/191910501',
+        'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a',
+        'info_dict': {
+            'id': '191910501',
+            'ext': 'mp4',
+            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 2678.31,
+        },
+    }, {
+        'url': 'carambatv:191910501',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id,
+            video_id)
+
+        title = video['title']
+
+        base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id
+
+        formats = [{
+            'url': base_url + f['fn'],
+            'height': int_or_none(f.get('height')),
+            'format_id': '%sp' % f['height'] if f.get('height') else None,
+        } for f in video['qualities'] if f.get('fn')]
+        self._sort_formats(formats)
+
+        thumbnail = video.get('splash')
+        duration = float_or_none(try_get(
+            video, lambda x: x['annotations'][0]['end_time'], compat_str))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class CarambaTVPageIE(InfoExtractor):
+    _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
+        'md5': '',
+        'info_dict': {
+            'id': '191910501',
+            'ext': 'mp4',
+            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 2678.31,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._og_search_property('video:iframe', webpage, default=None)
+
+        if not video_url:
+            video_id = self._search_regex(
+                r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
+                webpage, 'video id')
+            video_url = 'carambatv:%s' % video_id
+
+        return self.url_result(video_url, CarambaTVIE.ie_key())
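
try_get, used above for the duration, fetches a nested value while swallowing
any lookup error on the way, optionally type-checking the result. Roughly (a
simplified stand-in for youtube_dl.utils.try_get):

    def try_get_sketch(src, getter, expected_type=None):
        try:
            v = getter(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            return None
        if expected_type is not None and not isinstance(v, expected_type):
            return None
        return v

    video = {'annotations': [{'end_time': '2678.31'}]}  # invented payload
    assert try_get_sketch(video, lambda x: x['annotations'][0]['end_time'], str) == '2678.31'
    assert try_get_sketch({}, lambda x: x['annotations'][0]['end_time']) is None
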
index d8aa31038bfb85f6e5123fe8e7831a2eb22c0c45..ff663d07947fd345a107203892cbe1811080ac6c 100644 (file)
@@ -4,64 +4,66 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    js_to_json,
+    smuggle_url,
+)
 
 
 class CBCIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
     _TESTS = [{
         # with mediaId
         'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
+        'md5': '97e24d09672fc4cf56256d6faa6c25bc',
         'info_dict': {
             'id': '2682904050',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Don Cherry – All-Stars',
             'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
-            'timestamp': 1454475540,
+            'timestamp': 1454463000,
             'upload_date': '20160203',
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
+            'uploader': 'CBCC-NEW',
         },
     }, {
         # with clipId
         'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
+        'md5': '0274a90b51a9b4971fe005c63f592f12',
         'info_dict': {
             'id': '2487345465',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Robin Williams freestyles on 90 Minutes Live',
             'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
-            'upload_date': '19700101',
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
+            'upload_date': '19780210',
+            'uploader': 'CBCC-NEW',
+            'timestamp': 255977160,
         },
     }, {
         # multiple iframes
         'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
         'playlist': [{
+            'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
             'info_dict': {
                 'id': '2680832926',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
                 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
-                'upload_date': '19700101',
+                'upload_date': '20160201',
+                'timestamp': 1454342820,
+                'uploader': 'CBCC-NEW',
             },
         }, {
+            'md5': '415a0e3f586113894174dfb31aa5bb1a',
             'info_dict': {
                 'id': '2658915080',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Fly like an eagle!',
                 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
-                'upload_date': '19700101',
+                'upload_date': '20150315',
+                'timestamp': 1426443984,
+                'uploader': 'CBCC-NEW',
             },
         }],
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        },
     }]
 
     @classmethod
@@ -90,24 +92,54 @@ class CBCIE(InfoExtractor):
 
 class CBCPlayerIE(InfoExtractor):
     _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cbc.ca/player/play/2683190193',
+        'md5': '64d25f841ddf4ddb28a235338af32e2c',
         'info_dict': {
             'id': '2683190193',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Gerry Runs a Sweat Shop',
             'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
-            'timestamp': 1455067800,
+            'timestamp': 1455071400,
             'upload_date': '20160210',
+            'uploader': 'CBCC-NEW',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
+    }, {
+        # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
+        'url': 'http://www.cbc.ca/player/play/2657631896',
+        'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
+        'info_dict': {
+            'id': '2657631896',
+            'ext': 'mp3',
+            'title': 'CBC Montreal is organizing its first ever community hackathon!',
+            'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
+            'timestamp': 1425704400,
+            'upload_date': '20150307',
+            'uploader': 'CBCC-NEW',
         },
-    }
+    }, {
+        # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
+        'url': 'http://www.cbc.ca/player/play/2164402062',
+        'md5': '17a61eb813539abea40618d6323a7f82',
+        'info_dict': {
+            'id': '2164402062',
+            'ext': 'flv',
+            'title': 'Cancer survivor four times over',
+            'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
+            'timestamp': 1320410746,
+            'upload_date': '20111104',
+            'uploader': 'CBCC-NEW',
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        return self.url_result(
-            'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id,
-            'ThePlatformFeed', video_id)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(
+                'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, {
+                    'force_smil_url': True
+                }),
+            'id': video_id,
+        }
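The rewritten CBCPlayerIE result smuggles force_smil_url through to ThePlatform instead of going via the feed service. smuggle_url()/unsmuggle_url() (youtube_dl/utils.py) round-trip extra data in the URL fragment; a simplified sketch of the mechanism (the real helper also URL-encodes the payload, skipped here for brevity):

    import json

    def smuggle_url(url, data):
        # Piggy-back extra extractor options on the URL fragment.
        return url + '#__youtubedl_smuggle=' + json.dumps(data)

    def unsmuggle_url(smug_url, default=None):
        if '#__youtubedl_smuggle=' not in smug_url:
            return smug_url, default
        url, _, payload = smug_url.partition('#__youtubedl_smuggle=')
        return url, json.loads(payload)

    url, data = unsmuggle_url(smuggle_url(
        'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/2683190193?mbr=true&formats=MPEG4,FLV,MP3',
        {'force_smil_url': True}))
    assert data == {'force_smil_url': True}

ThePlatformIE unsmuggles the dict on its side and, seeing force_smil_url, uses the link as-is as the SMIL URL rather than rewriting it.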
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 051d783a23cc7c0b5858af0c24f63187181cd276..a23173d6f1a9570225242692ee74d68fc061fb3d 100644 (file)
@@ -1,15 +1,13 @@
 from __future__ import unicode_literals
 
-from .theplatform import ThePlatformIE
+from .theplatform import ThePlatformFeedIE
 from ..utils import (
-    xpath_text,
-    xpath_element,
     int_or_none,
     find_xpath_attr,
 )
 
 
-class CBSBaseIE(ThePlatformIE):
+class CBSBaseIE(ThePlatformFeedIE):
     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
         closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
         return {
@@ -19,9 +17,22 @@ class CBSBaseIE(ThePlatformIE):
             }]
         } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
 
+    def _extract_video_info(self, filter_query, video_id):
+        return self._extract_feed_info(
+            'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: {
+                'series': entry.get('cbs$SeriesTitle'),
+                'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
+                'episode': entry.get('cbs$EpisodeTitle'),
+                'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
+            }, {
+                'StreamPack': {
+                    'manifest': 'm3u',
+                }
+            })
+
 
 class CBSIE(CBSBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
+    _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
 
     _TESTS = [{
         'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
@@ -36,25 +47,7 @@ class CBSIE(CBSBaseIE):
             'upload_date': '20131127',
             'uploader': 'CBSI-NEW',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        },
-        '_skip': 'Blocked outside the US',
-    }, {
-        'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
-        'info_dict': {
-            'id': 'WWF_5KqY3PK1',
-            'display_id': 'st-vincent',
-            'ext': 'flv',
-            'title': 'Live on Letterman - St. Vincent',
-            'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
-            'duration': 3221,
-        },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        },
+        'expected_warnings': ['Failed to download m3u8 information'],
         '_skip': 'Blocked outside the US',
     }, {
         'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
@@ -66,43 +59,5 @@ class CBSIE(CBSBaseIE):
     TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        content_id = self._search_regex(
-            [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"],
-            webpage, 'content id')
-        items_data = self._download_xml(
-            'http://can.cbs.com/thunder/player/videoPlayerService.php',
-            content_id, query={'partner': 'cbs', 'contentId': content_id})
-        video_data = xpath_element(items_data, './/item')
-        title = xpath_text(video_data, 'videoTitle', 'title', True)
-
-        subtitles = {}
-        formats = []
-        for item in items_data.findall('.//item'):
-            pid = xpath_text(item, 'pid')
-            if not pid:
-                continue
-            tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid
-            if '.m3u8' in xpath_text(item, 'contentUrl', default=''):
-                tp_release_url += '&manifest=m3u'
-            tp_formats, tp_subtitles = self._extract_theplatform_smil(
-                tp_release_url, content_id, 'Downloading %s SMIL data' % pid)
-            formats.extend(tp_formats)
-            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
-        self._sort_formats(formats)
-
-        info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id)
-        info.update({
-            'id': content_id,
-            'display_id': display_id,
-            'title': title,
-            'series': xpath_text(video_data, 'seriesTitle'),
-            'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
-            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
-            'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
-            'thumbnail': xpath_text(video_data, 'previewImageURL'),
-            'formats': formats,
-            'subtitles': subtitles,
-        })
-        return info
+        content_id = self._match_id(url)
+        return self._extract_video_info('byGuid=%s' % content_id, content_id)
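CBSIE and the other CBS-family extractors below now funnel through _extract_video_info(), i.e. through ThePlatformFeedIE._extract_feed_info() with a byGuid=... (or byId=...) filter query. The feed endpoint has the shape visible in the CBCPlayerIE code removed above; roughly (URL template inferred from that removed code, so treat it as illustrative):

    provider_id, feed_id = 'dJ5BDC', 'VxxJg8Ymh8sE'
    content_id = '_u7W953k6la293J7EPTd9oHkSPs6Xn6_'
    feed_url = 'http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
        provider_id, feed_id, content_id)

so each site only has to supply its provider/feed ids plus a per-entry metadata mapping.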
diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py
index 0011c30296971486e66fc121d82f7c9be8c53164..821db20b23052ca71d594c6c05ad705a400129a3 100644 (file)
@@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE):
 
         media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
         formats, subtitles = [], {}
-        if site == 'cnet':
-            formats, subtitles = self._extract_theplatform_smil(
-                self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)
         for (fkey, vid) in vdata['files'].items():
             if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
                 continue
@@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):
             subtitles = self._merge_subtitles(subtitles, tp_subtitles)
         self._sort_formats(formats)
 
-        info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id)
+        info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)
         info.update({
             'id': video_id,
             'display_id': display_id,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
new file mode 100644 (file)
index 0000000..74adb38
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+
+from .anvato import AnvatoIE
+from .sendtonews import SendtoNewsIE
+from ..compat import compat_urlparse
+
+
+class CBSLocalIE(AnvatoIE):
+    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'
+
+    _TESTS = [{
+        # Anvato backend
+        'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
+        'md5': 'f0ee3081e3843f575fccef901199b212',
+        'info_dict': {
+            'id': '3401037',
+            'ext': 'mp4',
+            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+            'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': 1463440500,
+            'upload_date': '20160516',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\KCBSTV',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\AOL',
+                'Syndication\\Yahoo',
+                'Syndication\\Tribune',
+                'Syndication\\Curb.tv',
+                'Content\\News'
+            ],
+        },
+    }, {
+        # SendtoNews embed
+        'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
+        'info_dict': {
+            'id': 'GxfCe0Zo7D-175909-5588',
+            'ext': 'mp4',
+            'title': 'Recap: CLE 15, CIN 6',
+            'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+            'upload_date': '20160516',
+            'timestamp': 1463433840,
+            'duration': 49,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        sendtonews_url = SendtoNewsIE._extract_url(webpage)
+        if sendtonews_url:
+            info_dict = {
+                '_type': 'url_transparent',
+                'url': compat_urlparse.urljoin(url, sendtonews_url),
+            }
+        else:
+            info_dict = self._extract_anvato_videos(webpage, display_id)
+
+        time_str = self._html_search_regex(
+            r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False)
+        timestamp = None
+        if time_str:
+            timestamp = calendar.timegm(datetime.datetime.strptime(
+                time_str, '%b %d, %Y %I:%M %p').timetuple())
+
+        info_dict.update({
+            'display_id': display_id,
+            'timestamp': timestamp,
+        })
+
+        return info_dict
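The released-date handling in CBSLocalIE converts the parsed page time with calendar.timegm(), which reads the struct_time as UTC (time.mktime() would apply the local timezone instead). For a string in the format the regex captures:

    import calendar
    import datetime

    # Hypothetical entry-date text as it appears in the page markup.
    time_str = 'May 16, 2016 11:15 pm'
    timestamp = calendar.timegm(datetime.datetime.strptime(
        time_str, '%b %d, %Y %I:%M %p').timetuple())
    assert timestamp == 1463440500  # 2016-05-16 23:15 UTC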
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 79ddc20a09ca067922e25d315bd0fbdb03b0abf9..387537e766fc4886201285f221022181b2beeb53 100644 (file)
@@ -30,9 +30,12 @@ class CBSNewsIE(CBSBaseIE):
         {
             'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
             'info_dict': {
-                'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
+                'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y',
                 'ext': 'mp4',
                 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
+                'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
+                'upload_date': '19700101',
+                'uploader': 'CBSI-NEW',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 205,
                 'subtitles': {
@@ -58,30 +61,8 @@ class CBSNewsIE(CBSBaseIE):
             webpage, 'video JSON info'), video_id)
 
         item = video_info['item'] if 'item' in video_info else video_info
-        title = item.get('articleTitle') or item.get('hed')
-        duration = item.get('duration')
-        thumbnail = item.get('mediaImage') or item.get('thumbnail')
-
-        subtitles = {}
-        formats = []
-        for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
-            pid = item.get('media' + format_id)
-            if not pid:
-                continue
-            release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid
-            tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid)
-            formats.extend(tp_formats)
-            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        guid = item['mpxRefId']
+        return self._extract_video_info('byGuid=%s' % guid, guid)
 
 
 class CBSNewsLiveVideoIE(InfoExtractor):
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
index 549ae32f36c8ebd258896d4189ba90ae501c40d0..78ca44b024bfb20dc6ce79e4ee51f3472d599711 100644 (file)
@@ -1,30 +1,28 @@
 from __future__ import unicode_literals
 
-import re
+from .cbs import CBSBaseIE
 
-from .common import InfoExtractor
 
+class CBSSportsIE(CBSBaseIE):
+    _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P<id>\d+)'
 
-class CBSSportsIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
-
-    _TEST = {
-        'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
+    _TESTS = [{
+        'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast',
         'info_dict': {
-            'id': '_d5_GbO8p1sT',
-            'ext': 'flv',
-            'title': 'US Open flashbacks: 1990s',
-            'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
+            'id': '708337219968',
+            'ext': 'mp4',
+            'title': 'Ben Simmons the next LeBron? Not so fast',
+            'description': 'md5:854294f627921baba1f4b9a990d87197',
+            'timestamp': 1466293740,
+            'upload_date': '20160618',
+            'uploader': 'CBSI-NEW',
         },
-    }
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        section = mobj.group('section')
-        video_id = mobj.group('id')
-        all_videos = self._download_json(
-            'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
-            video_id)
-        # The json file contains the info of all the videos in the section
-        video_info = next(v for v in all_videos if v['pcid'] == video_id)
-        return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')
+        video_id = self._match_id(url)
+        return self._extract_video_info('byId=%s' % video_id, video_id)
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index dda2c0959882c3cd3c5de56b817ccd7815ef0068..8f7f09e22dad6eda3ca08edfbf9edc118146e893 100644 (file)
@@ -1,13 +1,9 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
-    parse_duration,
-    qualities,
-    unified_strdate,
+    parse_iso8601,
 )
 
 
@@ -19,14 +15,14 @@ class CCCIE(InfoExtractor):
         'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
         'md5': '3a1eda8f3a29515d27f5adb967d7e740',
         'info_dict': {
-            'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor',
+            'id': '1839',
             'ext': 'mp4',
             'title': 'Introduction to Processor Design',
-            'description': 'md5:80be298773966f66d56cb11260b879af',
+            'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'view_count': int,
             'upload_date': '20131228',
-            'duration': 3660,
+            'timestamp': 1388188800,
+            'duration': 3710,
         }
     }, {
         'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
@@ -34,79 +30,48 @@ class CCCIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        if self._downloader.params.get('prefer_free_formats'):
-            preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd'])
-        else:
-            preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd'])
-
-        title = self._html_search_regex(
-            r'(?s)<h1>(.*?)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<h3>About</h3>(.+?)<h3>',
-            webpage, 'description', fatal=False)
-        upload_date = unified_strdate(self._html_search_regex(
-            r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>",
-            webpage, 'upload date', fatal=False))
-        view_count = int_or_none(self._html_search_regex(
-            r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
-            webpage, 'view count', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'(?s)<span[^>]+class=(["\']).*?fa-clock-o.*?\1[^>]*></span>(?P<duration>.+?)</li',
-            webpage, 'duration', fatal=False, group='duration'))
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id')
+        event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)
 
-        matches = re.finditer(r'''(?xs)
-            <(?:span|div)\s+class='label\s+filetype'>(?P<format>[^<]*)</(?:span|div)>\s*
-            <(?:span|div)\s+class='label\s+filetype'>(?P<lang>[^<]*)</(?:span|div)>\s*
-            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
-            (?:
-                .*?
-                <a\s+(?:download\s+)?href='(?P<torrent_url>[^']+\.torrent)'
-            )?''', webpage)
         formats = []
-        for m in matches:
-            format = m.group('format')
-            format_id = self._search_regex(
-                r'.*/([a-z0-9_-]+)/[^/]*$',
-                m.group('http_url'), 'format id', default=None)
-            if format_id:
-                format_id = m.group('lang') + '-' + format_id
-            vcodec = 'h264' if 'h264' in format_id else (
-                'none' if format_id in ('mp3', 'opus') else None
+        for recording in event_data.get('recordings', []):
+            recording_url = recording.get('recording_url')
+            if not recording_url:
+                continue
+            language = recording.get('language')
+            folder = recording.get('folder')
+            format_id = None
+            if language:
+                format_id = language
+            if folder:
+                if language:
+                    format_id += '-' + folder
+                else:
+                    format_id = folder
+            vcodec = 'h264' if 'h264' in folder else (
+                'none' if folder in ('mp3', 'opus') else None
             )
             formats.append({
                 'format_id': format_id,
-                'format': format,
-                'language': m.group('lang'),
-                'url': m.group('http_url'),
+                'url': recording_url,
+                'width': int_or_none(recording.get('width')),
+                'height': int_or_none(recording.get('height')),
+                'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
+                'language': language,
                 'vcodec': vcodec,
-                'preference': preference(format_id),
             })
-
-            if m.group('torrent_url'):
-                formats.append({
-                    'format_id': 'torrent-%s' % (format if format_id is None else format_id),
-                    'format': '%s (torrent)' % format,
-                    'proto': 'torrent',
-                    'format_note': '(unsupported; will just download the .torrent file)',
-                    'vcodec': vcodec,
-                    'preference': -100 + preference(format_id),
-                    'url': m.group('torrent_url'),
-                })
         self._sort_formats(formats)
 
-        thumbnail = self._html_search_regex(
-            r"<video.*?poster='([^']+)'", webpage, 'thumbnail', fatal=False)
-
         return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'upload_date': upload_date,
-            'duration': duration,
+            'id': event_id,
+            'display_id': display_id,
+            'title': event_data['title'],
+            'description': event_data.get('description'),
+            'thumbnail': event_data.get('thumb_url'),
+            'timestamp': parse_iso8601(event_data.get('date')),
+            'duration': int_or_none(event_data.get('length')),
+            'tags': event_data.get('tags'),
             'formats': formats,
         }
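Two small points about the new CCC recordings loop: the API reports sizes in megabytes, which int_or_none(..., invscale=1024 * 1024) scales up to the byte counts youtube-dl expects, and a missing value passes through as None. A sketch of that helper's behaviour (mirrors youtube_dl/utils.py semantics, not the library source):

    def int_or_none(v, scale=1, default=None, invscale=1):
        try:
            return int(v) * invscale // scale
        except (TypeError, ValueError):
            return default

    assert int_or_none(64, invscale=1024 * 1024) == 67108864  # 64 MiB in bytes
    assert int_or_none(None, invscale=1024 * 1024) is None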
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 498d2c0d8a1dad129c8a1a9681da2f06503129f9..8af318703b0ae9ad57fedb49c7f320288322caa9 100755 (executable)
@@ -58,7 +58,8 @@ class CDAIE(InfoExtractor):
         def extract_format(page, version):
             unpacked = decode_packed_codes(page)
             format_url = self._search_regex(
-                r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False)
+                r"(?:file|url)\s*:\s*(\\?[\"'])(?P<url>http.+?)\1", unpacked,
+                '%s url' % version, fatal=False, group='url')
             if not format_url:
                 return
             f = {
@@ -75,7 +76,8 @@ class CDAIE(InfoExtractor):
             info_dict['formats'].append(f)
             if not info_dict['duration']:
                 info_dict['duration'] = parse_duration(self._search_regex(
-                    r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False))
+                    r"duration\s*:\s*(\\?[\"'])(?P<duration>.+?)\1",
+                    unpacked, 'duration', fatal=False, group='duration'))
 
         extract_format(webpage, 'default')
 
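The widened CDA regexes accept both the file and url keys and both escaped and plain quotes in the output of decode_packed_codes(); the back-reference \1 forces the closing quote to match the opening one. A quick illustration on made-up strings:

    import re

    pattern = r"(?:file|url)\s*:\s*(\\?[\"'])(?P<url>http.+?)\1"
    samples = [
        r"url:\'http://example.com/v.mp4\'",  # escaped quotes, 'url' key
        'file:"http://example.com/v.mp4"',    # plain quotes, 'file' key
    ]
    for s in samples:
        print(re.search(pattern, s).group('url'))
        # -> http://example.com/v.mp4 both times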
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 6652c8e42a279f45bdbbc1af3d36ad2500a454eb..5a58d1777d50297557cae49039df19cbfe15fef0 100644 (file)
@@ -33,19 +33,33 @@ class CeskaTelevizeIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
         'info_dict': {
-            'id': '61924494876844374',
+            'id': '61924494877028507',
             'ext': 'mp4',
-            'title': 'První republika: Zpěvačka z Dupárny Bobina',
-            'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+            'title': 'Hyde Park Civilizace: Bonus 01 - En',
+            'description': 'English Subtittles',
             'thumbnail': 're:^https?://.*\.jpg',
-            'duration': 88.4,
+            'duration': 81.3,
         },
         'params': {
             # m3u8 download
             'skip_download': True,
         },
+    }, {
+        # live stream
+        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
+        'info_dict': {
+            'id': 402,
+            'ext': 'mp4',
+            'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'is_live': True,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Georestricted to Czech Republic',
     }, {
         # video with 18+ caution trailer
         'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
@@ -118,19 +132,21 @@ class CeskaTelevizeIE(InfoExtractor):
         req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
         req.add_header('Referer', url)
 
-        playlist_title = self._og_search_title(webpage)
-        playlist_description = self._og_search_description(webpage)
+        playlist_title = self._og_search_title(webpage, default=None)
+        playlist_description = self._og_search_description(webpage, default=None)
 
         playlist = self._download_json(req, playlist_id)['playlist']
         playlist_len = len(playlist)
 
         entries = []
         for item in playlist:
+            is_live = item.get('type') == 'LIVE'
             formats = []
             for format_id, stream_url in item['streamUrls'].items():
                 formats.extend(self._extract_m3u8_formats(
                     stream_url, playlist_id, 'mp4',
-                    entry_protocol='m3u8_native', fatal=False))
+                    entry_protocol='m3u8' if is_live else 'm3u8_native',
+                    fatal=False))
             self._sort_formats(formats)
 
             item_id = item.get('id') or item['assetId']
@@ -145,14 +161,22 @@ class CeskaTelevizeIE(InfoExtractor):
                 if subs:
                     subtitles = self.extract_subtitles(episode_id, subs)
 
+            if playlist_len == 1:
+                final_title = playlist_title or title
+                if is_live:
+                    final_title = self._live_title(final_title)
+            else:
+                final_title = '%s (%s)' % (playlist_title, title)
+
             entries.append({
                 'id': item_id,
-                'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title),
+                'title': final_title,
                 'description': playlist_description if playlist_len == 1 else None,
                 'thumbnail': thumbnail,
                 'duration': duration,
                 'formats': formats,
                 'subtitles': subtitles,
+                'is_live': is_live,
             })
 
         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
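For the live entries, _live_title() stamps the current date and time onto the name, which is what the 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$' expectation above checks. Rough equivalent (a sketch, not the InfoExtractor source):

    import datetime

    def live_title(name):
        # Append a timestamp so repeated live grabs get distinct titles.
        return '%s %s' % (name, datetime.datetime.now().strftime('%Y-%m-%d %H:%M'))

    print(live_title('ČT Sport'))  # e.g. 'ČT Sport 2016-07-09 14:29'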
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index c74553dcfa7c689b7fc8d69147625b1169e1e178..34d4e61569b110b49998768f13bb81cdda75bd75 100644 (file)
@@ -20,54 +20,64 @@ class Channel9IE(InfoExtractor):
     '''
     IE_DESC = 'Channel 9'
     IE_NAME = 'channel9'
-    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
-
-    _TESTS = [
-        {
-            'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
-            'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
-            'info_dict': {
-                'id': 'Events/TechEd/Australia/2013/KOS002',
-                'ext': 'mp4',
-                'title': 'Developer Kick-Off Session: Stuff We Love',
-                'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
-                'duration': 4576,
-                'thumbnail': 're:http://.*\.jpg',
-                'session_code': 'KOS002',
-                'session_day': 'Day 1',
-                'session_room': 'Arena 1A',
-                'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
-            },
+    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+
+    _TESTS = [{
+        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+        'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+        'info_dict': {
+            'id': 'Events/TechEd/Australia/2013/KOS002',
+            'ext': 'mp4',
+            'title': 'Developer Kick-Off Session: Stuff We Love',
+            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+            'duration': 4576,
+            'thumbnail': 're:http://.*\.jpg',
+            'session_code': 'KOS002',
+            'session_day': 'Day 1',
+            'session_room': 'Arena 1A',
+            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
+                                 'Mads Kristensen'],
         },
-        {
-            'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
-            'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
-            'info_dict': {
-                'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
-                'ext': 'mp4',
-                'title': 'Self-service BI with Power BI - nuclear testing',
-                'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
-                'duration': 1540,
-                'thumbnail': 're:http://.*\.jpg',
-                'authors': ['Mike Wilmot'],
-            },
+    }, {
+        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+        'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+        'info_dict': {
+            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
+            'ext': 'mp4',
+            'title': 'Self-service BI with Power BI - nuclear testing',
+            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+            'duration': 1540,
+            'thumbnail': 're:http://.*\.jpg',
+            'authors': ['Mike Wilmot'],
         },
-        {
-            # low quality mp4 is best
-            'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
-            'info_dict': {
-                'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
-                'ext': 'mp4',
-                'title': 'Ranges for the Standard Library',
-                'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
-                'duration': 5646,
-                'thumbnail': 're:http://.*\.jpg',
-            },
-            'params': {
-                'skip_download': True,
-            },
-        }
-    ]
+    }, {
+        # low quality mp4 is best
+        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+        'info_dict': {
+            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+            'ext': 'mp4',
+            'title': 'Ranges for the Standard Library',
+            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+            'duration': 5646,
+            'thumbnail': 're:http://.*\.jpg',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+        'info_dict': {
+            'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
+            'title': 'Channel 9',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+        'only_matching': True,
+    }, {
+        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
+        'only_matching': True,
+    }]
 
     _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
 
@@ -254,22 +264,30 @@ class Channel9IE(InfoExtractor):
 
         return self.playlist_result(contents)
 
-    def _extract_list(self, content_path):
-        rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
+    def _extract_list(self, video_id, rss_url=None):
+        if not rss_url:
+            rss_url = self._RSS_URL % video_id
+        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
         entries = [self.url_result(session_url.text, 'Channel9')
                    for session_url in rss.findall('./channel/item/link')]
         title_text = rss.find('./channel/title').text
-        return self.playlist_result(entries, content_path, title_text)
+        return self.playlist_result(entries, video_id, title_text)
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         content_path = mobj.group('contentpath')
+        rss = mobj.group('rss')
+
+        if rss:
+            return self._extract_list(content_path, url)
 
-        webpage = self._download_webpage(url, content_path, 'Downloading web page')
+        webpage = self._download_webpage(
+            url, content_path, 'Downloading web page')
 
-        page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
-        if page_type_m is not None:
-            page_type = page_type_m.group('pagetype')
+        page_type = self._search_regex(
+            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
+            webpage, 'page type', default=None, group='pagetype')
+        if page_type:
             if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content
                 return self._extract_entry_item(webpage, content_path)
             elif page_type == 'Session':  # Event session page, may contain downloadable content
@@ -278,6 +296,5 @@ class Channel9IE(InfoExtractor):
                 return self._extract_list(content_path)
             else:
                 raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
-
         else:  # Assuming list
             return self._extract_list(content_path)
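Note how the new Channel 9 _VALID_URL carves a trailing /RSS into its own group, so feed URLs go straight to _extract_list() without fetching the webpage. The two match modes, using the pattern from above:

    import re

    pattern = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'

    m = re.match(pattern, 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS')
    print(m.group('contentpath'), bool(m.group('rss')))
    # Events/DEVintersection/DEVintersection-2016 True

    m = re.match(pattern, 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002')
    print(m.group('contentpath'), bool(m.group('rss')))
    # Events/TechEd/Australia/2013/KOS002 False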
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
deleted file mode 100644 (file)
index 042c4f2..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import ExtractorError
-from .screenwavemedia import ScreenwaveMediaIE
-
-
-class CinemassacreIE(InfoExtractor):
-    _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
-    _TESTS = [
-        {
-            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
-            'md5': 'fde81fbafaee331785f58cd6c0d46190',
-            'info_dict': {
-                'id': 'Cinemassacre-19911',
-                'ext': 'mp4',
-                'upload_date': '20121110',
-                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
-                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
-            'md5': 'd72f10cd39eac4215048f62ab477a511',
-            'info_dict': {
-                'id': 'Cinemassacre-521be8ef82b16',
-                'ext': 'mp4',
-                'upload_date': '20131002',
-                'title': 'The Mummy’s Hand (1940)',
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        },
-        {
-            # Youtube embedded video
-            'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
-            'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',
-            'info_dict': {
-                'id': 'OEVzPCY2T-g',
-                'ext': 'webm',
-                'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
-                'upload_date': '20061207',
-                'uploader': 'Cinemassacre',
-                'uploader_id': 'JamesNintendoNerd',
-                'description': 'md5:784734696c2b8b7f4b8625cc799e07f6',
-            }
-        },
-        {
-            # Youtube embedded video
-            'url': 'http://cinemassacre.com/2006/09/01/mckids/',
-            'md5': '7393c4e0f54602ad110c793eb7a6513a',
-            'info_dict': {
-                'id': 'FnxsNhuikpo',
-                'ext': 'webm',
-                'upload_date': '20060901',
-                'uploader': 'Cinemassacre Extra',
-                'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
-                'uploader_id': 'Cinemassacre',
-                'title': 'AVGN: McKids',
-            }
-        },
-        {
-            'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
-            'md5': '1376908e49572389e7b06251a53cdd08',
-            'info_dict': {
-                'id': 'Cinemassacre-555779690c440',
-                'ext': 'mp4',
-                'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
-                'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
-                'upload_date': '20150525',
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        }
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
-        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
-
-        webpage = self._download_webpage(url, display_id)
-
-        playerdata_url = self._search_regex(
-            [
-                ScreenwaveMediaIE.EMBED_PATTERN,
-                r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
-            ],
-            webpage, 'player data URL', default=None, group='url')
-        if not playerdata_url:
-            raise ExtractorError('Unable to find player data')
-
-        video_title = self._html_search_regex(
-            r'<title>(?P<title>.+?)\|', webpage, 'title')
-        video_description = self._html_search_regex(
-            r'<div class="entry-content">(?P<description>.+?)</div>',
-            webpage, 'description', flags=re.DOTALL, fatal=False)
-        video_thumbnail = self._og_search_thumbnail(webpage)
-
-        return {
-            '_type': 'url_transparent',
-            'display_id': display_id,
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_date,
-            'thumbnail': video_thumbnail,
-            'url': playerdata_url,
-        }
diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py
index 4f9320ea57cb542f69b6b0be1965bb004fa35574..d55b26d59ff89af6ef9ae7943cb67b4346031f66 100644 (file)
@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    int_or_none,
-    parse_iso8601,
-)
+from .onet import OnetBaseIE
 
 
-class ClipRsIE(InfoExtractor):
+class ClipRsIE(OnetBaseIE):
     _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
     _TEST = {
         'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
@@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        display_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)
 
-        video_id = self._search_regex(
-            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+        mvp_id = self._search_mvp_id(webpage)
 
-        response = self._download_json(
-            'http://qi.ckm.onetapi.pl/', video_id,
-            query={
-                'body[id]': video_id,
-                'body[jsonrpc]': '2.0',
-                'body[method]': 'get_asset_detail',
-                'body[params][ID_Publikacji]': video_id,
-                'body[params][Service]': 'www.onet.pl',
-                'content-type': 'application/jsonp',
-                'x-onet-app': 'player.front.onetapi.pl',
-            })
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict['display_id'] = display_id
 
-        error = response.get('error')
-        if error:
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
-
-        video = response['result'].get('0')
-
-        formats = []
-        for _, formats_dict in video['formats'].items():
-            if not isinstance(formats_dict, dict):
-                continue
-            for format_id, format_list in formats_dict.items():
-                if not isinstance(format_list, list):
-                    continue
-                for f in format_list:
-                    if not f.get('url'):
-                        continue
-                    formats.append({
-                        'url': f['url'],
-                        'format_id': format_id,
-                        'height': int_or_none(f.get('vertical_resolution')),
-                        'width': int_or_none(f.get('horizontal_resolution')),
-                        'abr': float_or_none(f.get('audio_bitrate')),
-                        'vbr': float_or_none(f.get('video_bitrate')),
-                    })
-        self._sort_formats(formats)
-
-        meta = video.get('meta', {})
-
-        title = self._og_search_title(webpage, default=None) or meta['title']
-        description = self._og_search_description(webpage, default=None) or meta.get('description')
-        duration = meta.get('length') or meta.get('lenght')
-        timestamp = parse_iso8601(meta.get('addDate'), ' ')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-        }
+        return info_dict
diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py
new file mode 100644 (file)
index 0000000..26243d5
--- /dev/null
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class CloserToTruthIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
+        'info_dict': {
+            'id': '0_zof1ktre',
+            'display_id': 'solutions-the-mind-body-problem',
+            'ext': 'mov',
+            'title': 'Solutions to the Mind-Body Problem?',
+            'upload_date': '20140221',
+            'timestamp': 1392956007,
+            'uploader_id': 'CTTXML'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
+        'info_dict': {
+            'id': '0_iuxai6g6',
+            'display_id': 'how-do-brains-work',
+            'ext': 'mov',
+            'title': 'How do Brains Work?',
+            'upload_date': '20140221',
+            'timestamp': 1392956024,
+            'uploader_id': 'CTTXML'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://closertotruth.com/interviews/1725',
+        'info_dict': {
+            'id': '1725',
+            'title': 'AyaFr-002',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
+            webpage, 'kaltura partner_id')
+
+        title = self._search_regex(
+            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+
+        select = self._search_regex(
+            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
+            webpage, 'select version', default=None)
+        if select:
+            entry_ids = set()
+            entries = []
+            for mobj in re.finditer(
+                    r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
+                    webpage):
+                entry_id = mobj.group('id')
+                if entry_id in entry_ids:
+                    continue
+                entry_ids.add(entry_id)
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+                    'ie_key': 'Kaltura',
+                    'title': mobj.group('title'),
+                })
+            if entries:
+                return self.playlist_result(entries, display_id, title)
+
+        entry_id = self._search_regex(
+            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
+            webpage, 'kaltura entry_id', group='id')
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+            'ie_key': 'Kaltura',
+            'title': title
+        }
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 9e267e6c0260e0391ff04b61c613a2fb6d916313..9a28ef35423a5dfd28295333ffc037b8919873ac 100644 (file)
@@ -19,7 +19,7 @@ from ..utils import (
 class CloudyIE(InfoExtractor):
     _IE_DESC = 'cloudy.ec and videoraj.ch'
     _VALID_URL = r'''(?x)
-        https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/
+        https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/
         (?:v/|embed\.php\?id=)
         (?P<id>[A-Za-z0-9]+)
         '''
@@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor):
             }
         },
         {
-            'url': 'http://www.videoraj.ch/v/47f399fd8bb60',
+            'url': 'http://www.videoraj.to/v/47f399fd8bb60',
             'md5': '7d0f8799d91efd4eda26587421c3c3b0',
             'info_dict': {
                 'id': '47f399fd8bb60',
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
deleted file mode 100644 (file)
index 002b240..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class CollegeHumorIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
-
-    _TESTS = [
-        {
-            'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
-            'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
-            'info_dict': {
-                'id': '6902724',
-                'ext': 'mp4',
-                'title': 'Comic-Con Cosplay Catastrophe',
-                'description': "Fans get creative this year at San Diego.  Too creative.  And yes, that's really Joss Whedon.",
-                'age_limit': 13,
-                'duration': 187,
-            },
-        }, {
-            'url': 'http://www.collegehumor.com/video/3505939/font-conference',
-            'md5': '72fa701d8ef38664a4dbb9e2ab721816',
-            'info_dict': {
-                'id': '3505939',
-                'ext': 'mp4',
-                'title': 'Font Conference',
-                'description': "This video wasn't long enough, so we made it double-spaced.",
-                'age_limit': 10,
-                'duration': 179,
-            },
-        }, {
-            # embedded youtube video
-            'url': 'http://www.collegehumor.com/embed/6950306',
-            'info_dict': {
-                'id': 'Z-bao9fg6Yc',
-                'ext': 'mp4',
-                'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
-                'uploader': 'Mark Dice',
-                'uploader_id': 'MarkDice',
-                'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
-                'upload_date': '20140127',
-            },
-            'params': {
-                'skip_download': True,
-            },
-            'add_ie': ['Youtube'],
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('videoid')
-
-        jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
-        data = json.loads(self._download_webpage(
-            jsonUrl, video_id, 'Downloading info JSON'))
-        vdata = data['video']
-        if vdata.get('youtubeId') is not None:
-            return {
-                '_type': 'url',
-                'url': vdata['youtubeId'],
-                'ie_key': 'Youtube',
-            }
-
-        AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
-        rating = vdata.get('rating')
-        if rating:
-            age_limit = AGE_LIMITS.get(rating.lower())
-        else:
-            age_limit = None  # None = No idea
-
-        PREFS = {'high_quality': 2, 'low_quality': 0}
-        formats = []
-        for format_key in ('mp4', 'webm'):
-            for qname, qurl in vdata.get(format_key, {}).items():
-                formats.append({
-                    'format_id': format_key + '_' + qname,
-                    'url': qurl,
-                    'format': format_key,
-                    'preference': PREFS.get(qname),
-                })
-        self._sort_formats(formats)
-
-        duration = int_or_none(vdata.get('duration'), 1000)
-        like_count = int_or_none(vdata.get('likes'))
-
-        return {
-            'id': video_id,
-            'title': vdata['title'],
-            'description': vdata.get('description'),
-            'thumbnail': vdata.get('thumbnail'),
-            'formats': formats,
-            'age_limit': age_limit,
-            'duration': duration,
-            'like_count': like_count,
-        }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 0c59102e072594857cc0f1c53e15c183b1885a93..2b6aaa3aa47541669a59b7936477023d54ae7437 100644 (file)
@@ -44,10 +44,10 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
     _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
                       |https?://(:www\.)?
-                          (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+                          (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/
                          ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
                           (?P<clip>
-                              (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
+                              (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
                               |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
                           )|
@@ -129,6 +129,9 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
     }, {
         'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
         'only_matching': True,
+    }, {
+        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
+        'only_matching': True,
     }]
 
     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 02cd2c003ce0ab7b305d67e4f0e9c61304795620..816baa424e2a9b8efc5f9ce0c27ff320cca77e74 100644 (file)
@@ -44,7 +44,9 @@ from ..utils import (
     sanitized_Request,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     url_basename,
+    xpath_element,
     xpath_text,
     xpath_with_ns,
     determine_protocol,
@@ -52,6 +54,7 @@ from ..utils import (
     mimetype2ext,
     update_Request,
     update_url_query,
+    parse_m3u8_attributes,
 )
 
 
@@ -159,11 +162,12 @@ class InfoExtractor(object):
                         * "height" (optional, int)
                         * "resolution" (optional, string "{width}x{height"},
                                         deprecated)
+                        * "filesize" (optional, int)
     thumbnail:      Full URL to a video thumbnail image.
     description:    Full video description.
     uploader:       Full name of the video uploader.
     license:        License name the video is licensed under.
-    creator:        The main artist who created the video.
+    creator:        The creator of the video.
     release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
@@ -747,10 +751,12 @@ class InfoExtractor(object):
         return self._og_search_property('url', html, **kargs)
 
     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+        if not isinstance(name, (list, tuple)):
+            name = [name]
         if display_name is None:
-            display_name = name
+            display_name = name[0]
         return self._html_search_regex(
-            self._meta_regex(name),
+            [self._meta_regex(n) for n in name],
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -799,15 +805,17 @@ class InfoExtractor(object):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
-    def _search_json_ld(self, html, video_id, **kwargs):
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
         json_ld = self._search_regex(
             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
             html, 'JSON-LD', group='json_ld', **kwargs)
         if not json_ld:
             return {}
-        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+        return self._json_ld(
+            json_ld, video_id, fatal=kwargs.get('fatal', True),
+            expected_type=expected_type)
 
-    def _json_ld(self, json_ld, video_id, fatal=True):
+    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
         if isinstance(json_ld, compat_str):
             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
         if not json_ld:
@@ -815,6 +823,8 @@ class InfoExtractor(object):
         info = {}
         if json_ld.get('@context') == 'http://schema.org':
             item_type = json_ld.get('@type')
+            if expected_type is not None and expected_type != item_type:
+                return info
             if item_type == 'TVEpisode':
                 info.update({
                     'episode': unescapeHTML(json_ld.get('name')),
@@ -833,6 +843,19 @@ class InfoExtractor(object):
                     'title': unescapeHTML(json_ld.get('headline')),
                     'description': unescapeHTML(json_ld.get('articleBody')),
                 })
+            elif item_type == 'VideoObject':
+                info.update({
+                    'url': json_ld.get('contentUrl'),
+                    'title': unescapeHTML(json_ld.get('name')),
+                    'description': unescapeHTML(json_ld.get('description')),
+                    'thumbnail': json_ld.get('thumbnailUrl'),
+                    'duration': parse_duration(json_ld.get('duration')),
+                    'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+                    'filesize': float_or_none(json_ld.get('contentSize')),
+                    'tbr': int_or_none(json_ld.get('bitrate')),
+                    'width': int_or_none(json_ld.get('width')),
+                    'height': int_or_none(json_ld.get('height')),
+                })
         return dict((k, v) for k, v in info.items() if v is not None)
 
     @staticmethod
@@ -874,7 +897,11 @@ class InfoExtractor(object):
                 f['ext'] = determine_ext(f['url'])
 
             if isinstance(field_preference, (list, tuple)):
-                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+                return tuple(
+                    f.get(field)
+                    if f.get(field) is not None
+                    else ('' if field == 'format_id' else -1)
+                    for field in field_preference)
 
             preference = f.get('preference')
             if preference is None:
@@ -987,7 +1014,7 @@ class InfoExtractor(object):
 
     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                             fatal=True):
+                             fatal=True, m3u8_id=None):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
             'Unable to download f4m manifest',
@@ -1001,11 +1028,18 @@ class InfoExtractor(object):
 
         return self._parse_f4m_formats(
             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-            transform_source=transform_source, fatal=fatal)
+            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
 
     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                           fatal=True):
+                           fatal=True, m3u8_id=None):
+        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
+        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
+        if akamai_pv is not None and ';' in akamai_pv.text:
+            playerVerificationChallenge = akamai_pv.text.split(';')[0]
+            if playerVerificationChallenge.strip() != '':
+                return []
+
         formats = []
         manifest_version = '1.0'
         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -1022,9 +1056,26 @@ class InfoExtractor(object):
             'base URL', default=None)
         if base_url:
             base_url = base_url.strip()
+
+        bootstrap_info = xpath_element(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+            'bootstrap info', default=None)
+
         for i, media_el in enumerate(media_nodes):
-            if manifest_version == '2.0':
-                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+            tbr = int_or_none(media_el.attrib.get('bitrate'))
+            width = int_or_none(media_el.attrib.get('width'))
+            height = int_or_none(media_el.attrib.get('height'))
+            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+            # If <bootstrapInfo> is present, the specified f4m is a
+            # stream-level manifest, and only set-level manifests may refer to
+            # external resources. See sections 11.4 and 4 of the F4M spec.
+            if bootstrap_info is None:
+                media_url = None
+                # @href is introduced in 2.0, see section 11.6 of F4M spec
+                if manifest_version == '2.0':
+                    media_url = media_el.attrib.get('href')
+                if media_url is None:
+                    media_url = media_el.attrib.get('url')
                 if not media_url:
                     continue
                 manifest_url = (
@@ -1034,29 +1085,43 @@ class InfoExtractor(object):
                 # since bitrates in parent manifest (this one) and media_url manifest
                 # may differ leading to inability to resolve the format by requested
                 # bitrate in f4m downloader
-                if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
+                ext = determine_ext(manifest_url)
+                if ext == 'f4m':
+                    f4m_formats = self._extract_f4m_formats(
                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-                        transform_source=transform_source, fatal=fatal))
+                        transform_source=transform_source, fatal=fatal)
+                    # Sometimes a stream-level manifest contains a single media entry that
+                    # carries no quality metadata (e.g. http://matchtv.ru/#live-player),
+                    # while the parent's media entry in the set-level manifest may
+                    # carry it. In such cases we copy it from the parent.
+                    if len(f4m_formats) == 1:
+                        f = f4m_formats[0]
+                        f.update({
+                            'tbr': f.get('tbr') or tbr,
+                            'width': f.get('width') or width,
+                            'height': f.get('height') or height,
+                            'format_id': f.get('format_id') if not tbr else format_id,
+                        })
+                    formats.extend(f4m_formats)
+                    continue
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        manifest_url, video_id, 'mp4', preference=preference,
+                        m3u8_id=m3u8_id, fatal=fatal))
                     continue
-            tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
-                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+                'format_id': format_id,
                 'url': manifest_url,
-                'ext': 'flv',
+                'ext': 'flv' if bootstrap_info is not None else None,
                 'tbr': tbr,
-                'width': int_or_none(media_el.attrib.get('width')),
-                'height': int_or_none(media_el.attrib.get('height')),
+                'width': width,
+                'height': height,
                 'preference': preference,
             })
         return formats
 
-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
-                              entry_protocol='m3u8', preference=None,
-                              m3u8_id=None, note=None, errnote=None,
-                              fatal=True):
-
-        formats = [{
+    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+        return {
             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
             'url': m3u8_url,
             'ext': ext,
@@ -1064,7 +1129,14 @@ class InfoExtractor(object):
             'preference': preference - 1 if preference else -1,
             'resolution': 'multiple',
             'format_note': 'Quality selection URL',
-        }]
+        }
+
+    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+                              entry_protocol='m3u8', preference=None,
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True, live=False):
+
+        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
 
         format_url = lambda u: (
             u
@@ -1104,23 +1176,11 @@ class InfoExtractor(object):
             }]
         last_info = None
         last_media = None
-        kv_rex = re.compile(
-            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
         for line in m3u8_doc.splitlines():
             if line.startswith('#EXT-X-STREAM-INF:'):
-                last_info = {}
-                for m in kv_rex.finditer(line):
-                    v = m.group('val')
-                    if v.startswith('"'):
-                        v = v[1:-1]
-                    last_info[m.group('key')] = v
+                last_info = parse_m3u8_attributes(line)
             elif line.startswith('#EXT-X-MEDIA:'):
-                last_media = {}
-                for m in kv_rex.finditer(line):
-                    v = m.group('val')
-                    if v.startswith('"'):
-                        v = v[1:-1]
-                    last_media[m.group('key')] = v
+                last_media = parse_m3u8_attributes(line)
             elif line.startswith('#') or not line.strip():
                 continue
             else:
@@ -1131,8 +1191,15 @@ class InfoExtractor(object):
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
-                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
-                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
+                # Although the specification does not mention the NAME attribute
+                # for EXT-X-STREAM-INF, it may still sometimes be present
+                stream_name = last_info.get('NAME') or last_media_name
+                # The bandwidth of live streams may vary over time, making
+                # format_id unpredictable, so it's better to keep the provided
+                # format_id intact.
+                if not live:
+                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                 f = {
                     'format_id': '-'.join(format_id),
                     'url': format_url(line.strip()),
@@ -1264,21 +1331,21 @@ class InfoExtractor(object):
         m3u8_count = 0
 
         srcs = []
-        videos = smil.findall(self._xpath_ns('.//video', namespace))
-        for video in videos:
-            src = video.get('src')
+        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+        for medium in media:
+            src = medium.get('src')
             if not src or src in srcs:
                 continue
             srcs.append(src)
 
-            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-            filesize = int_or_none(video.get('size') or video.get('fileSize'))
-            width = int_or_none(video.get('width'))
-            height = int_or_none(video.get('height'))
-            proto = video.get('proto')
-            ext = video.get('ext')
+            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
+            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
+            width = int_or_none(medium.get('width'))
+            height = int_or_none(medium.get('height'))
+            proto = medium.get('proto')
+            ext = medium.get('ext')
             src_ext = determine_ext(src)
-            streamer = video.get('streamer') or base
+            streamer = medium.get('streamer') or base
 
             if proto == 'rtmp' or streamer.startswith('rtmp'):
                 rtmp_count += 1
@@ -1681,6 +1748,13 @@ class InfoExtractor(object):
     def _mark_watched(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def geo_verification_headers(self):
+        headers = {}
+        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+        if geo_verification_proxy:
+            headers['Ytdl-request-proxy'] = geo_verification_proxy
+        return headers
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
new file mode 100644 (file)
index 0000000..a901b8d
--- /dev/null
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    qualities,
+)
+
+
+class CoubIE(InfoExtractor):
+    _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)'
+
+    _TESTS = [{
+        'url': 'http://coub.com/view/5u5n1',
+        'info_dict': {
+            'id': '5u5n1',
+            'ext': 'mp4',
+            'title': 'The Matrix Moonwalk',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 4.6,
+            'timestamp': 1428527772,
+            'upload_date': '20150408',
+            'uploader': 'Артём Лоскутников',
+            'uploader_id': 'artyom.loskutnikov',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'comment_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4',
+        'only_matching': True,
+    }, {
+        'url': 'coub:5u5n1',
+        'only_matching': True,
+    }, {
+        # longer video id
+        'url': 'http://coub.com/view/237d5l5h',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        coub = self._download_json(
+            'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id)
+
+        if coub.get('error'):
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, coub['error']), expected=True)
+
+        title = coub['title']
+
+        file_versions = coub['file_versions']
+
+        QUALITIES = ('low', 'med', 'high')
+
+        MOBILE = 'mobile'
+        IPHONE = 'iphone'
+        HTML5 = 'html5'
+
+        SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5)
+
+        quality_key = qualities(QUALITIES)
+        preference_key = qualities(SOURCE_PREFERENCE)
+
+        formats = []
+
+        for kind, items in file_versions.get(HTML5, {}).items():
+            if kind not in ('video', 'audio'):
+                continue
+            if not isinstance(items, dict):
+                continue
+            for quality, item in items.items():
+                if not isinstance(item, dict):
+                    continue
+                item_url = item.get('url')
+                if not item_url:
+                    continue
+                formats.append({
+                    'url': item_url,
+                    'format_id': '%s-%s-%s' % (HTML5, kind, quality),
+                    'filesize': int_or_none(item.get('size')),
+                    'vcodec': 'none' if kind == 'audio' else None,
+                    'quality': quality_key(quality),
+                    'preference': preference_key(HTML5),
+                })
+
+        iphone_url = file_versions.get(IPHONE, {}).get('url')
+        if iphone_url:
+            formats.append({
+                'url': iphone_url,
+                'format_id': IPHONE,
+                'preference': preference_key(IPHONE),
+            })
+
+        mobile_url = file_versions.get(MOBILE, {}).get('audio_url')
+        if mobile_url:
+            formats.append({
+                'url': mobile_url,
+                'format_id': '%s-audio' % MOBILE,
+                'preference': preference_key(MOBILE),
+            })
+
+        self._sort_formats(formats)
+
+        thumbnail = coub.get('picture')
+        duration = float_or_none(coub.get('duration'))
+        timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at'))
+        uploader = coub.get('channel', {}).get('title')
+        uploader_id = coub.get('channel', {}).get('permalink')
+
+        view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
+        like_count = int_or_none(coub.get('likes_count'))
+        repost_count = int_or_none(coub.get('recoubs_count'))
+        comment_count = int_or_none(coub.get('comments_count'))
+
+        age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
+        if age_restricted is not None:
+            age_limit = 18 if age_restricted is True else 0
+        else:
+            age_limit = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'repost_count': repost_count,
+            'comment_count': comment_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
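
CoubIE ranks encoding quality and source kind with two utils.qualities scorers. Assuming the usual behavior of that helper (position in the given tuple, -1 for unknown values), it amounts to:

    def qualities(quality_ids):
        # Later entries rank higher; values outside the tuple sort last.
        def q(qid):
            try:
                return quality_ids.index(qid)
            except ValueError:
                return -1
        return q

    quality_key = qualities(('low', 'med', 'high'))
    preference_key = qualities(('mobile', 'iphone', 'html5'))
    assert quality_key('high') > quality_key('med') > quality_key('low')
    assert preference_key('html5') == 2
    assert quality_key('ultra') == -1  # unknown quality sorts last
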
index 8ae3f28903deeb94cd723c7f431a3c86a45538ea..90a64303d3282c925d1620787922622e99a8b612 100644 (file)
@@ -11,7 +11,6 @@ from math import pow, sqrt, floor
 from .common import InfoExtractor
 from ..compat import (
     compat_etree_fromstring,
-    compat_urllib_parse_unquote,
     compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
@@ -27,6 +26,7 @@ from ..utils import (
     unified_strdate,
     urlencode_postdata,
     xpath_text,
+    extract_attributes,
 )
 from ..aes import (
     aes_cbc_decrypt,
@@ -306,28 +306,36 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage,
             'video_uploader', fatal=False)
 
-        playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
-        playerdata_req = sanitized_Request(playerdata_url)
-        playerdata_req.data = urlencode_postdata({'current_page': webpage_url})
-        playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-
-        stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
-        video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
-
+        available_fmts = []
+        for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+            attrs = extract_attributes(a)
+            href = attrs.get('href')
+            if href and '/freetrial' in href:
+                continue
+            available_fmts.append(fmt)
+        if not available_fmts:
+            for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+                available_fmts = re.findall(p, webpage)
+                if available_fmts:
+                    break
+        video_encode_ids = []
         formats = []
-        for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
+        for fmt in available_fmts:
             stream_quality, stream_format = self._FORMAT_IDS[fmt]
             video_format = fmt + 'p'
             streamdata_req = sanitized_Request(
                 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
-                % (stream_id, stream_format, stream_quality),
+                % (video_id, stream_format, stream_quality),
                 compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8'))
             streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
             streamdata = self._download_xml(
                 streamdata_req, video_id,
                 note='Downloading media info for %s' % video_format)
             stream_info = streamdata.find('./{default}preload/stream_info')
+            video_encode_id = xpath_text(stream_info, './video_encode_id')
+            if video_encode_id in video_encode_ids:
+                continue
+            video_encode_ids.append(video_encode_id)
             video_url = xpath_text(stream_info, './host')
             video_play_path = xpath_text(stream_info, './file')
             if not video_url or not video_play_path:
@@ -359,6 +367,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
                 'ext': 'flv',
             })
             formats.append(format_info)
+        self._sort_formats(formats)
+
+        metadata = self._download_xml(
+            'http://www.crunchyroll.com/xml', video_id,
+            note='Downloading media info', query={
+                'req': 'RpcApiVideoPlayer_GetMediaMetadata',
+                'media_id': video_id,
+            })
 
         subtitles = self.extract_subtitles(video_id, webpage)
 
@@ -366,9 +382,12 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             'id': video_id,
             'title': video_title,
             'description': video_description,
-            'thumbnail': video_thumbnail,
+            'thumbnail': xpath_text(metadata, 'episode_image_url'),
             'uploader': video_uploader,
             'upload_date': video_upload_date,
+            'series': xpath_text(metadata, 'series_title'),
+            'episode': xpath_text(metadata, 'episode_title'),
+            'episode_number': int_or_none(xpath_text(metadata, 'episode_number')),
             'subtitles': subtitles,
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py
new file mode 100644 (file)
index 0000000..5807fba
--- /dev/null
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class CTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)'
+    _TESTS = [{
+        'url': 'http://www.ctv.ca/video/player?vid=706966',
+        'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
+        'info_dict': {
+            'id': '706966',
+            'ext': 'mp4',
+            'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'',
+            'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.',
+            'upload_date': '20150919',
+            'timestamp': 1442624700,
+        },
+        'expected_warnings': ['HTTP Error 404'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': '9c9media:ctv_web:%s' % video_id,
+            'ie_key': 'NineCNineMedia',
+        }
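
CTVIE never fetches the page itself: it hands off via a url_transparent result, so youtube-dl resolves the 9c9media: URL with the NineCNineMedia extractor and overlays the fields returned here. A quick way to watch that delegation, assuming network access (as the test warns, the sample media may 404):

    import youtube_dl

    with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
        info = ydl.extract_info(
            'http://www.ctv.ca/video/player?vid=706966', download=False)
        print(info['id'], info['extractor'], info['title'])
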
diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py
new file mode 100644 (file)
index 0000000..1023b61
--- /dev/null
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import orderedSet
+
+
+class CTVNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
+    _TESTS = [{
+        'url': 'http://www.ctvnews.ca/video?clipId=901995',
+        'md5': '10deb320dc0ccb8d01d34d12fc2ea672',
+        'info_dict': {
+            'id': '901995',
+            'ext': 'mp4',
+            'title': 'Extended: \'That person cannot be me\' Johnson says',
+            'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
+            'timestamp': 1467286284,
+            'upload_date': '20160630',
+        }
+    }, {
+        'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
+        'info_dict':
+        {
+            'id': '1.2966224',
+        },
+        'playlist_mincount': 19,
+    }, {
+        'url': 'http://www.ctvnews.ca/video?binId=1.2876780',
+        'info_dict':
+        {
+            'id': '1.2876780',
+        },
+        'playlist_mincount': 100,
+    }, {
+        'url': 'http://www.ctvnews.ca/1.810401',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        def ninecninemedia_url_result(clip_id):
+            return {
+                '_type': 'url_transparent',
+                'id': clip_id,
+                'url': '9c9media:ctvnews_web:%s' % clip_id,
+                'ie_key': 'NineCNineMedia',
+            }
+
+        if page_id.isdigit():
+            return ninecninemedia_url_result(page_id)
+        else:
+            webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={
+                'ot': 'example.AjaxPageLayout.ot',
+                'maxItemsPerPage': 1000000,
+            })
+            entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
+                re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+            return self.playlist_result(entries, page_id)
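
The playlist branch deduplicates the scraped clip ids with utils.orderedSet, which keeps only the first occurrence of each item while preserving order; a minimal equivalent:

    def ordered_set(iterable):
        # First-seen-wins deduplication, order preserved (like utils.orderedSet).
        res = []
        for item in iterable:
            if item not in res:
                res.append(item)
        return res

    print(ordered_set(['901995', '902001', '901995']))  # ['901995', '902001']
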
index f5cefd9660829d1ab65ec789c208ba6938e0de5a..ebd14cb1638b6309f1522c142502c0dac5d763a2 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class CWTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+    _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
     _TESTS = [{
         'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
         'info_dict': {
@@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         }
+    }, {
+        'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
new file mode 100644 (file)
index 0000000..b60a1d8
--- /dev/null
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+        'md5': '2f639d446394f53f3a33658b518b6615',
+        'info_dict': {
+            'id': '1288527',
+            'ext': 'mp4',
+            'title': 'Turn any video into an impressionist masterpiece',
+            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = video_data['title']
+        video_sources = self._download_json(video_data.get(
+            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('descr'),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
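
The tbr value above leans on the scale argument of utils.int_or_none to turn the rendition's encodingRate (bit/s) into kbit/s. A simplified sketch of the assumed helper behavior (the real one also takes default/invscale arguments):

    def int_or_none(v, scale=1):
        return None if v in (None, '') else int(v) // scale

    tbr = int_or_none(1200000, 1000)  # hypothetical encodingRate in bit/s
    print(tbr)  # 1200 -> format_id becomes e.g. 'hls-1200' or 'http-1200'
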
index 2e6226ea0774af2e636cbc4b4a4ca9f1ecb763a3..1f92823b74ab9da8176ac690841fa144c2630bc1 100644 (file)
@@ -16,6 +16,7 @@ from ..utils import (
     sanitized_Request,
     str_to_int,
     unescapeHTML,
+    mimetype2ext,
 )
 
 
@@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         }
     ]
 
+    @staticmethod
+    def _extract_urls(webpage):
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        return list(map(lambda m: unescapeHTML(m[1]), matches))
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                     type_ = media.get('type')
                     if type_ == 'application/vnd.lumberjack.manifest':
                         continue
-                    ext = determine_ext(media_url)
-                    if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+                    ext = mimetype2ext(type_) or determine_ext(media_url)
+                    if ext == 'm3u8':
                         formats.extend(self._extract_m3u8_formats(
                             media_url, video_id, 'mp4', preference=-1,
                             m3u8_id='hls', fatal=False))
-                    elif type_ == 'application/f4m' or ext == 'f4m':
+                    elif ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
                             media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                     else:
                         f = {
                             'url': media_url,
                             'format_id': 'http-%s' % quality,
+                            'ext': ext,
                         }
                         m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                         if m:
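
The new DailymotionIE._extract_urls staticmethod lets the generic extractor collect embedded players from arbitrary pages; it can also be exercised directly (the video id below is hypothetical):

    from youtube_dl.extractor.dailymotion import DailymotionIE

    html = '<iframe frameborder="0" src="//www.dailymotion.com/embed/video/x2y5kqf"></iframe>'
    print(DailymotionIE._extract_urls(html))
    # ['//www.dailymotion.com/embed/video/x2y5kqf']
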
index 86024a745661dda2da9d3fb883ccf4db017a722c..b5c310ccb8042c7bfa44c6a909ead398fc679dd4 100644 (file)
@@ -66,22 +66,32 @@ class DaumIE(InfoExtractor):
             'view_count': int,
             'comment_count': int,
         },
+    }, {
+        # Requires dte_type=WEB (#9972)
+        'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU',
+        'md5': 'a8917742069a4dd442516b86e7d66529',
+        'info_dict': {
+            'id': 's3794Uf1NZeZ1qMpGpeqeRU',
+            'ext': 'mp4',
+            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611',
+            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회',
+            'upload_date': '20160611',
+        },
     }]
 
     def _real_extract(self, url):
         video_id = compat_urllib_parse_unquote(self._match_id(url))
-        query = compat_urllib_parse_urlencode({'vid': video_id})
         movie_data = self._download_json(
-            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,
-            video_id, 'Downloading video formats info')
+            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json',
+            video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})
 
         # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid
         if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):
             return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)
 
         info = self._download_xml(
-            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
-            'Downloading video info')
+            'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id,
+            'Downloading video info', query={'vid': video_id})
 
         formats = []
         for format_el in movie_data['output_list']['output_list']:
index 5deff5f30ea22592c24015b6b78546175b0f498e..efb8585e825c9f5cd746e4601380aa90d09246f7 100644 (file)
@@ -20,7 +20,7 @@ from ..utils import (
 
 
 class DCNIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
 
     def _real_extract(self, url):
         show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
@@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor):
             'is_live': is_live,
         }
 
-    def _extract_video_formats(self, webpage, video_id, entry_protocol):
+    def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol):
         formats = []
-        m3u8_url = self._html_search_regex(
-            r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False)
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None))
-
-        rtsp_url = self._search_regex(
-            r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
-        if rtsp_url:
-            formats.append({
-                'url': rtsp_url,
-                'format_id': 'rtsp',
-            })
-
+        format_url_base = 'http' + self._html_search_regex(
+            [
+                r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8',
+                r'<a[^>]+href="rtsp(://[^"]+)"'
+            ], webpage, 'format url')
+        # TODO: Current DASH formats are broken - $Time$ pattern in
+        # <SegmentTemplate> not implemented yet
+        # formats.extend(self._extract_mpd_formats(
+        #     format_url_base + '/manifest.mpd',
+        #     video_id, mpd_id='dash', fatal=False))
+        formats.extend(self._extract_m3u8_formats(
+            format_url_base + '/playlist.m3u8', video_id, 'mp4',
+            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            format_url_base + '/manifest.f4m',
+            video_id, f4m_id='hds', fatal=False))
         self._sort_formats(formats)
         return formats
 
 
 class DCNVideoIE(DCNBaseIE):
     IE_NAME = 'dcn:video'
-    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
+    _TESTS = [{
         'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
         'info_dict':
         {
@@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE):
 
 class DCNLiveIE(DCNBaseIE):
     IE_NAME = 'dcn:live'
-    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
 
     def _real_extract(self, url):
         channel_id = self._match_id(url)
@@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE):
 
 class DCNSeasonIE(InfoExtractor):
     IE_NAME = 'dcn:season'
-    _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
     _TEST = {
         'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
         'info_dict':
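
The rewritten _extract_video_formats derives a single protocol-stripped base URL from either the HLS file: declaration or the RTSP link, then fans out to the playlist.m3u8 and manifest.f4m variants (the host below is hypothetical):

    import re

    webpage = 'file: "https://media.example.ae/live/stream123/playlist.m3u8"'
    format_url_base = 'http' + re.search(
        r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', webpage).group(1)
    print(format_url_base + '/playlist.m3u8')
    print(format_url_base + '/manifest.f4m')
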
index cdfeccacb447591f4dcc776a9c1a374a794fa5ba..a4d0448c26149429ebd7d5813f432b56bf0e6020 100644 (file)
@@ -12,39 +12,46 @@ class DFBIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
-        # The md5 is different each time
+        'md5': 'ac0f98a52a330f700b4b3034ad240649',
         'info_dict': {
             'id': '11633',
             'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
             'upload_date': '20150714',
         },
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
 
-        webpage = self._download_webpage(url, display_id)
         player_info = self._download_xml(
             'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
             display_id)
         video_info = player_info.find('video')
-
-        f4m_info = self._download_xml(
-            self._proto_relative_url(video_info.find('url').text.strip()), display_id)
-        token_el = f4m_info.find('token')
-        manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
-        formats = self._extract_f4m_formats(manifest_url, display_id)
+        stream_access_url = self._proto_relative_url(video_info.find('url').text.strip())
+
+        formats = []
+        # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats
+        for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'):
+            stream_access_info = self._download_xml(sa_url, display_id)
+            token_el = stream_access_info.find('token')
+            manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth']
+            if '.f4m' in manifest_url:
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url + '&hdcore=3.2.0',
+                    display_id, f4m_id='hds', fatal=False))
+            else:
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, display_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False))
         self._sort_formats(formats)
 
         return {
             'id': video_id,
             'display_id': display_id,
             'title': video_info.find('title').text,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id,
             'upload_date': unified_strdate(video_info.find('time_date').text),
             'formats': formats,
         }
index 5f1275b39a1e40048dfdbe865f38bb5d72d89426..55853f76f91e97db19423c6cf8c1b8b56e006447 100644 (file)
@@ -33,6 +33,7 @@ class DiscoveryIE(InfoExtractor):
             'duration': 156,
             'timestamp': 1302032462,
             'upload_date': '20110405',
+            'uploader_id': '103207',
         },
         'params': {
             'skip_download': True,  # requires ffmpeg
@@ -54,7 +55,11 @@ class DiscoveryIE(InfoExtractor):
             'upload_date': '20140725',
             'timestamp': 1406246400,
             'duration': 116,
+            'uploader_id': '103207',
         },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
     }]
 
     def _real_extract(self, url):
@@ -66,13 +71,19 @@ class DiscoveryIE(InfoExtractor):
         entries = []
 
         for idx, video_info in enumerate(info['playlist']):
-            formats = self._extract_m3u8_formats(
-                video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
-                note='Download m3u8 information for video %d' % (idx + 1))
-            self._sort_formats(formats)
+            subtitles = {}
+            caption_url = video_info.get('captionsUrl')
+            if caption_url:
+                subtitles = {
+                    'en': [{
+                        'url': caption_url,
+                    }]
+                }
+
             entries.append({
+                '_type': 'url_transparent',
+                'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'],
                 'id': compat_str(video_info['id']),
-                'formats': formats,
                 'title': video_info['title'],
                 'description': video_info.get('description'),
                 'duration': parse_duration(video_info.get('video_length')),
@@ -80,6 +91,7 @@ class DiscoveryIE(InfoExtractor):
                 'thumbnail': video_info.get('thumbnailURL'),
                 'alt_title': video_info.get('secondary_title'),
                 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+                'subtitles': subtitles,
             })
 
         return self.playlist_result(entries, display_id, video_title)
index 3915cb182961711873b7a75b48958ac50602aa45..ce6962755831a8c6f853271ad49112fee4fcbc63 100644 (file)
@@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):
             'display_id': 'iseven',
             'ext': 'flv',
             'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:f34981259a03e980a3c6404190a3ed61',
+            'description': 're:.*m7show@163\.com.*',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': '7师傅',
             'uploader_id': '431925',
@@ -43,7 +43,7 @@ class DouyuTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-        'skip': 'Romm not found',
+        'skip': 'Room not found',
     }, {
         'url': 'http://www.douyutv.com/17732',
         'info_dict': {
@@ -51,7 +51,7 @@ class DouyuTVIE(InfoExtractor):
             'display_id': '17732',
             'ext': 'flv',
             'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:f34981259a03e980a3c6404190a3ed61',
+            'description': 're:.*m7show@163\.com.*',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': '7师傅',
             'uploader_id': '431925',
@@ -75,13 +75,28 @@ class DouyuTVIE(InfoExtractor):
             room_id = self._html_search_regex(
                 r'"room_id"\s*:\s*(\d+),', page, 'room id')
 
-        prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
-            room_id, int(time.time()))
-
-        auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
-        config = self._download_json(
-            'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
-            video_id)
+        config = None
+        # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache"
+        # Retry with different parameters, since the same parameters cause the same errors
+        for i in range(5):
+            prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+                room_id, int(time.time()))
+            auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
+
+            config_page = self._download_webpage(
+                'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+                video_id)
+            try:
+                config = self._parse_json(config_page, video_id, fatal=False)
+            except ExtractorError:
+                # Wait some time before retrying to get a different time() value
+                self._sleep(1, video_id, msg_template='%(video_id)s: Error occurred. '
+                                                      'Waiting for %(timeout)s seconds before retrying')
+                continue
+            else:
+                break
+        if config is None:
+            raise ExtractorError('Unable to fetch API result')
 
         data = config['data']
 
index ae7c571bd3d06b41895d46f1df0dbd803170de0b..d740652f172c1dc9b61b19205af20b6721e10211 100644 (file)
@@ -2,13 +2,16 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
 from ..compat import compat_urlparse
 
 
 class DWIE(InfoExtractor):
     IE_NAME = 'dw'
-    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'
     _TESTS = [{
         # video
         'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
@@ -31,6 +34,18 @@ class DWIE(InfoExtractor):
             'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
             'upload_date': '20160311',
         }
+    }, {
+        # DW documentaries, only last for one or two weeks
+        'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798',
+        'md5': '56b6214ef463bfb9a3b71aeb886f3cf1',
+        'info_dict': {
+            'id': '19274438',
+            'ext': 'mp4',
+            'title': 'Welcome to the 90s – Hip Hop',
+            'description': 'Welcome to the 90s - The Golden Decade of Hip Hop',
+            'upload_date': '20160521',
+        },
+        'skip': 'Video removed',
     }]
 
     def _real_extract(self, url):
@@ -38,6 +53,7 @@ class DWIE(InfoExtractor):
         webpage = self._download_webpage(url, media_id)
         hidden_inputs = self._hidden_inputs(webpage)
         title = hidden_inputs['media_title']
+        media_id = hidden_inputs.get('media_id') or media_id
 
         if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
             formats = self._extract_smil_formats(
@@ -49,13 +65,20 @@ class DWIE(InfoExtractor):
         else:
             formats = [{'url': hidden_inputs['file_name']}]
 
+        upload_date = hidden_inputs.get('display_date')
+        if not upload_date:
+            upload_date = self._html_search_regex(
+                r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage,
+                'upload date', default=None)
+            upload_date = unified_strdate(upload_date)
+
         return {
             'id': media_id,
             'title': title,
             'description': self._og_search_description(webpage),
             'thumbnail': hidden_inputs.get('preview_image'),
             'duration': int_or_none(hidden_inputs.get('file_duration')),
-            'upload_date': hidden_inputs.get('display_date'),
+            'upload_date': upload_date,
             'formats': formats,
         }
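
The new fallback scrapes a DD.MM.YYYY date from the page's date span and normalizes it through utils.unified_strdate, which is assumed to accept that format among others:

    from youtube_dl.utils import unified_strdate

    # '21.05.2016' is the shape captured by r'<span[^>]+class="date">([0-9.]+)\s*\|'
    print(unified_strdate('21.05.2016'))  # 20160521
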
 
index 0f8c73fd7d330da33afd3c2a3d2cb732e4d3ff33..12d28d3b9f1e76f84f0f9fa322befd0bfa056f09 100644 (file)
@@ -23,7 +23,7 @@ class EaglePlatformIE(InfoExtractor):
     _TESTS = [{
         # http://lenta.ru/news/2015/03/06/navalny/
         'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
-        'md5': '881ee8460e1b7735a8be938e2ffb362b',
+        # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
         'info_dict': {
             'id': '227304',
             'ext': 'mp4',
@@ -50,6 +50,14 @@ class EaglePlatformIE(InfoExtractor):
         'skip': 'Georestricted',
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
+            webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
     @staticmethod
     def _handle_error(response):
         status = int_or_none(response.get('status', 200))
@@ -109,8 +117,11 @@ class EaglePlatformIE(InfoExtractor):
             mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url'])
             if mobj:
                 http_format = m3u8_format.copy()
+                video_url = mp4_url.replace(mp4_url_basename, mobj.group(1))
+                if not self._is_valid_url(video_url, video_id):
+                    continue
                 http_format.update({
-                    'url': mp4_url.replace(mp4_url_basename, mobj.group(1)),
+                    'url': video_url,
                     'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                     'protocol': 'http',
                 })
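
The added _is_valid_url check guards the HTTP format URLs that the extractor synthesizes by splicing each HLS variant's directory name into the template mp4 URL; both URLs below are hypothetical shapes of that scheme:

    import re

    mp4_url = 'http://example.media.eagleplatform.com/mp4/227304/360p.mp4'
    mp4_url_basename = '360p.mp4'
    m3u8_url = 'http://example.media.eagleplatform.com/hls/227304/720p/index.m3u8'

    mobj = re.search(r'/([^/]+)/index\.m3u8', m3u8_url)
    if mobj:
        print(mp4_url.replace(mp4_url_basename, mobj.group(1)))
        # http://example.media.eagleplatform.com/mp4/227304/720p
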
index e006921ec3f8d2a0aff0e6bb0595148469b1c256..ac5d0fe2426a8e3fb213b2bb6d8f59fa8acc5fba 100644 (file)
@@ -11,8 +11,8 @@ from ..utils import (
 
 
 class EpornerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)'
+    _TESTS = [{
         'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
         'md5': '39d486f046212d8e1b911c52ab4691f8',
         'info_dict': {
@@ -23,8 +23,12 @@ class EpornerIE(InfoExtractor):
             'duration': 1838,
             'view_count': int,
             'age_limit': 18,
-        }
-    }
+        },
+    }, {
+        # New (May 2016) URL layout
+        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index db4b263bcbf40a9cb133d2a9729e4fe07292bae3..66c08bec47d8aa639cf758bb3e083b9772230c76 100644 (file)
@@ -8,6 +8,7 @@ class ESPNIE(InfoExtractor):
     _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
     _TESTS = [{
         'url': 'http://espn.go.com/video/clip?id=10365079',
+        'md5': '60e5d097a523e767d06479335d1bdc58',
         'info_dict': {
             'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
             'ext': 'mp4',
@@ -15,21 +16,22 @@ class ESPNIE(InfoExtractor):
             'description': None,
         },
         'params': {
-            # m3u8 download
             'skip_download': True,
         },
+        'add_ie': ['OoyalaExternal'],
     }, {
         # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
         'url': 'http://espn.go.com/video/clip?id=2743663',
+        'md5': 'f4ac89b59afc7e2d7dbb049523df6768',
         'info_dict': {
             'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',
             'ext': 'mp4',
             'title': 'Must-See Moments: Best of the MLS season',
         },
         'params': {
-            # m3u8 download
             'skip_download': True,
         },
+        'add_ie': ['OoyalaExternal'],
     }, {
         'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
         'only_matching': True,
index 6de3438fc8993e2f789494f951822a0d06850880..9f70ce75265d02e4cd2a6f2bb4b6b1ce8231d1f6 100644 (file)
@@ -3,6 +3,10 @@ from __future__ import unicode_literals
 
 from .abc import ABCIE
 from .abc7news import Abc7NewsIE
+from .abcnews import (
+    AbcNewsIE,
+    AbcNewsVideoIE,
+)
 from .academicearth import AcademicEarthCourseIE
 from .acast import (
     ACastIE,
@@ -16,7 +20,11 @@ from .adobetv import (
     AdobeTVVideoIE,
 )
 from .adultswim import AdultSwimIE
-from .aenetworks import AENetworksIE
+from .aenetworks import (
+    AENetworksIE,
+    HistoryTopicIE,
+)
+from .afreecatv import AfreecaTVIE
 from .aftonbladet import AftonbladetIE
 from .airmozilla import AirMozillaIE
 from .aljazeera import AlJazeeraIE
@@ -39,7 +47,6 @@ from .archiveorg import ArchiveOrgIE
 from .ard import (
     ARDIE,
     ARDMediathekIE,
-    SportschauIE,
 )
 from .arte import (
     ArteTvIE,
@@ -52,6 +59,7 @@ from .arte import (
     ArteTVDDCIE,
     ArteTVMagazineIE,
     ArteTVEmbedIE,
+    ArteTVPlaylistIE,
 )
 from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
@@ -65,6 +73,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbc import (
     BBCCoUkIE,
     BBCCoUkArticleIE,
+    BBCCoUkIPlayerPlaylistIE,
+    BBCCoUkPlaylistIE,
     BBCIE,
 )
 from .beeg import BeegIE
@@ -75,6 +85,7 @@ from .bigflix import BigflixIE
 from .bild import BildIE
 from .bilibili import BiliBiliIE
 from .biobiochiletv import BioBioChileTVIE
+from .biqle import BIQLEIE
 from .bleacherreport import (
     BleacherReportIE,
     BleacherReportCMSIE,
@@ -101,11 +112,16 @@ from .camwithher import CamWithHerIE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
 from .canvas import CanvasIE
+from .carambatv import (
+    CarambaTVIE,
+    CarambaTVPageIE,
+)
 from .cbc import (
     CBCIE,
     CBCPlayerIE,
 )
 from .cbs import CBSIE
+from .cbslocal import CBSLocalIE
 from .cbsinteractive import CBSInteractiveIE
 from .cbsnews import (
     CBSNewsIE,
@@ -123,11 +139,11 @@ from .chirbit import (
     ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
-from .cinemassacre import CinemassacreIE
-from .cliprs import ClipRsIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
+from .cliprs import ClipRsIE
 from .clipsyndicate import ClipsyndicateIE
+from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
 from .clubic import ClubicIE
 from .clyp import ClypIE
@@ -138,7 +154,7 @@ from .cnn import (
     CNNBlogsIE,
     CNNArticleIE,
 )
-from .collegehumor import CollegeHumorIE
+from .coub import CoubIE
 from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comcarcoff import ComCarCoffIE
@@ -155,8 +171,11 @@ from .crunchyroll import (
 )
 from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
+from .ctv import CTVIE
+from .ctvnews import CTVNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
 from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
@@ -226,6 +245,7 @@ from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
 from .extremetube import ExtremeTubeIE
+from .eyedotv import EyedoTVIE
 from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
@@ -236,8 +256,10 @@ from .fivemin import FiveMinIE
 from .fivetv import FiveTVIE
 from .fktv import FKTVIE
 from .flickr import FlickrIE
+from .flipagram import FlipagramIE
 from .folketinget import FolketingetIE
 from .footyroom import FootyRoomIE
+from .formula1 import Formula1IE
 from .fourtube import FourTubeIE
 from .fox import FOXIE
 from .foxgay import FoxgayIE
@@ -260,6 +282,7 @@ from .freespeech import FreespeechIE
 from .freevideo import FreeVideoIE
 from .funimation import FunimationIE
 from .funnyordie import FunnyOrDieIE
+from .fusion import FusionIE
 from .gameinformer import GameInformerIE
 from .gamekings import GamekingsIE
 from .gameone import (
@@ -269,7 +292,6 @@ from .gameone import (
 from .gamersyde import GamersydeIE
 from .gamespot import GameSpotIE
 from .gamestar import GameStarIE
-from .gametrailers import GametrailersIE
 from .gazeta import GazetaIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
@@ -282,6 +304,7 @@ from .globo import (
     GloboArticleIE,
 )
 from .godtube import GodTubeIE
+from .godtv import GodTVIE
 from .goldenmoustache import GoldenMoustacheIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
@@ -304,6 +327,10 @@ from .hotnewhiphop import HotNewHipHopIE
 from .hotstar import HotStarIE
 from .howcast import HowcastIE
 from .howstuffworks import HowStuffWorksIE
+from .hrti import (
+    HRTiIE,
+    HRTiPlaylistIE,
+)
 from .huffpost import HuffPostIE
 from .hypem import HypemIE
 from .iconosquare import IconosquareIE
@@ -342,6 +369,7 @@ from .jove import JoveIE
 from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kaltura import KalturaIE
+from .kamcord import KamcordIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
@@ -365,6 +393,7 @@ from .kuwo import (
 )
 from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
+from .learnr import LearnrIE
 from .lecture2go import Lecture2GoIE
 from .lemonde import LemondeIE
 from .leeco import (
@@ -372,6 +401,7 @@ from .leeco import (
     LePlaylistIE,
     LetvCloudIE,
 )
+from .libraryofcongress import LibraryOfCongressIE
 from .libsyn import LibsynIE
 from .lifenews import (
     LifeNewsIE,
@@ -382,6 +412,7 @@ from .limelight import (
     LimelightChannelIE,
     LimelightChannelListIE,
 )
+from .litv import LiTVIE
 from .liveleak import LiveLeakIE
 from .livestream import (
     LivestreamIE,
@@ -389,6 +420,7 @@ from .livestream import (
     LivestreamShortenerIE,
 )
 from .lnkgo import LnkGoIE
+from .localnews8 import LocalNews8IE
 from .lovehomeporn import LoveHomePornIE
 from .lrt import LRTIE
 from .lynda import (
@@ -400,13 +432,17 @@ from .macgamestore import MacGameStoreIE
 from .mailru import MailRuIE
 from .makerschannel import MakersChannelIE
 from .makertv import MakerTVIE
-from .malemotion import MalemotionIE
 from .matchtv import MatchTVIE
 from .mdr import MDRIE
+from .meta import METAIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .mgtv import MGTVIE
+from .microsoftvirtualacademy import (
+    MicrosoftVirtualAcademyIE,
+    MicrosoftVirtualAcademyCourseIE,
+)
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .minoto import MinotoIE
@@ -431,6 +467,7 @@ from .motherless import MotherlessIE
 from .motorsport import MotorsportIE
 from .movieclips import MovieClipsIE
 from .moviezine import MoviezineIE
+from .msn import MSNIE
 from .mtv import (
     MTVIE,
     MTVServicesEmbeddedIE,
@@ -439,8 +476,7 @@ from .mtv import (
 )
 from .muenchentv import MuenchenTVIE
 from .musicplayon import MusicPlayOnIE
-from .muzu import MuzuTVIE
-from .mwave import MwaveIE
+from .mwave import MwaveIE, MwaveMeetGreetIE
 from .myspace import MySpaceIE, MySpaceAlbumIE
 from .myspass import MySpassIE
 from .myvi import MyviIE
@@ -458,7 +494,6 @@ from .nbc import (
     NBCNewsIE,
     NBCSportsIE,
     NBCSportsVPlayerIE,
-    MSNBCIE,
 )
 from .ndr import (
     NDRIE,
@@ -495,8 +530,12 @@ from .nhl import (
     NHLVideocenterCategoryIE,
     NHLIE,
 )
-from .nick import NickIE
+from .nick import (
+    NickIE,
+    NickDeIE,
+)
 from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
 from .noco import NocoIE
 from .normalboots import NormalbootsIE
@@ -544,6 +583,10 @@ from .nytimes import (
 from .nuvid import NuvidIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
+from .onet import (
+    OnetIE,
+    OnetChannelIE,
+)
 from .onionstudios import OnionStudiosIE
 from .ooyala import (
     OoyalaIE,
@@ -562,7 +605,10 @@ from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
 from .people import PeopleIE
-from .periscope import PeriscopeIE
+from .periscope import (
+    PeriscopeIE,
+    PeriscopeUserIE,
+)
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
@@ -579,6 +625,7 @@ from .pluralsight import (
     PluralsightCourseIE,
 )
 from .podomatic import PodomaticIE
+from .polskieradio import PolskieRadioIE
 from .porn91 import Porn91IE
 from .pornhd import PornHdIE
 from .pornhub import (
@@ -602,7 +649,14 @@ from .qqmusic import (
     QQMusicToplistIE,
     QQMusicPlaylistIE,
 )
-from .r7 import R7IE
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
+from .radiocanada import (
+    RadioCanadaIE,
+    RadioCanadaAudioVideoIE,
+)
 from .radiode import RadioDeIE
 from .radiojavan import RadioJavanIE
 from .radiobremen import RadioBremenIE
@@ -616,11 +670,16 @@ from .rds import RDSIE
 from .redtube import RedTubeIE
 from .regiotv import RegioTVIE
 from .restudy import RestudyIE
+from .reuters import ReutersIE
 from .reverbnation import ReverbNationIE
-from .revision3 import Revision3IE
+from .revision3 import (
+    Revision3EmbedIE,
+    Revision3IE,
+)
 from .rice import RICEIE
 from .ringtv import RingTVIE
 from .ro220 import Ro220IE
+from .rockstargames import RockstarGamesIE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtbf import RTBFIE
@@ -656,18 +715,21 @@ from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .screenjunkies import ScreenJunkiesIE
 from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
+from .seeker import SeekerIE
 from .senateisvp import SenateISVPIE
+from .sendtonews import SendtoNewsIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
-from .sexykarma import SexyKarmaIE
 from .shahid import ShahidIE
 from .shared import SharedIE
 from .sharesix import ShareSixIE
 from .sina import SinaIE
+from .sixplay import SixPlayIE
 from .skynewsarabia import (
     SkyNewsArabiaIE,
     SkyNewsArabiaArticleIE,
 )
+from .skysports import SkySportsIE
 from .slideshare import SlideshareIE
 from .slutload import SlutloadIE
 from .smotri import (
@@ -676,10 +738,6 @@ from .smotri import (
     SmotriUserIE,
     SmotriBroadcastIE,
 )
-from .snagfilms import (
-    SnagFilmsIE,
-    SnagFilmsEmbedIE,
-)
 from .snotr import SnotrIE
 from .sohu import SohuIE
 from .soundcloud import (
@@ -712,6 +770,7 @@ from .sportbox import (
     SportBoxEmbedIE,
 )
 from .sportdeutschland import SportDeutschlandIE
+from .sportschau import SportschauIE
 from .srgssr import (
     SRGSSRIE,
     SRGSSRPlayIE,
@@ -731,7 +790,10 @@ from .svt import (
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
-from .tagesschau import TagesschauIE
+from .tagesschau import (
+    TagesschauPlayerIE,
+    TagesschauIE,
+)
 from .tapely import TapelyIE
 from .tass import TassIE
 from .tdslifeway import TDSLifewayIE
@@ -749,6 +811,7 @@ from .telecinco import TelecincoIE
 from .telegraaf import TelegraafIE
 from .telemb import TeleMBIE
 from .teletask import TeleTaskIE
+from .telewebion import TelewebionIE
 from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .theintercept import TheInterceptIE
@@ -761,6 +824,7 @@ from .thesixtyone import TheSixtyOneIE
 from .thestar import TheStarIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
+from .threeqsdn import ThreeQSDNIE
 from .tinypic import TinyPicIE
 from .tlc import TlcDeIE
 from .tmz import (
@@ -813,7 +877,10 @@ from .tvc import (
 )
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
-from .tvp import TvpIE, TvpSeriesIE
+from .tvp import (
+    TVPIE,
+    TVPSeriesIE,
+)
 from .tvplay import TVPlayIE
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
@@ -828,8 +895,8 @@ from .twitch import (
     TwitchVodIE,
     TwitchProfileIE,
     TwitchPastBroadcastsIE,
-    TwitchBookmarksIE,
     TwitchStreamIE,
+    TwitchClipsIE,
 )
 from .twitter import (
     TwitterCardIE,
@@ -844,16 +911,23 @@ from .udn import UDNEmbedIE
 from .digiteka import DigitekaIE
 from .unistra import UnistraIE
 from .urort import UrortIE
+from .urplay import URPlayIE
 from .usatoday import USATodayIE
 from .ustream import UstreamIE, UstreamChannelIE
-from .ustudio import UstudioIE
+from .ustudio import (
+    UstudioIE,
+    UstudioEmbedIE,
+)
 from .varzesh3 import Varzesh3IE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
 from .vessel import VesselIE
 from .vesti import VestiIE
-from .vevo import VevoIE
+from .vevo import (
+    VevoIE,
+    VevoPlaylistIE,
+)
 from .vgtv import (
     BTArticleIE,
     BTVestlendingenIE,
@@ -864,6 +938,7 @@ from .vice import (
     ViceIE,
     ViceShowIE,
 )
+from .vidbit import VidbitIE
 from .viddler import ViddlerIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
@@ -875,6 +950,7 @@ from .videomore import (
 )
 from .videopremium import VideoPremiumIE
 from .videott import VideoTtIE
+from .vidio import VidioIE
 from .vidme import (
     VidmeIE,
     VidmeUserIE,
@@ -882,6 +958,10 @@ from .vidme import (
 )
 from .vidzi import VidziIE
 from .vier import VierIE, VierVideosIE
+from .viewlift import (
+    ViewLiftIE,
+    ViewLiftEmbedIE,
+)
 from .viewster import ViewsterIE
 from .viidea import ViideaIE
 from .vimeo import (
@@ -916,25 +996,29 @@ from .vporn import VpornIE
 from .vrt import VRTIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
-from .vulture import VultureIE
 from .walla import WallaIE
-from .washingtonpost import WashingtonPostIE
+from .washingtonpost import (
+    WashingtonPostIE,
+    WashingtonPostArticleIE,
+)
 from .wat import WatIE
+from .watchindianporn import WatchIndianPornIE
 from .wdr import (
     WDRIE,
     WDRMobileIE,
-    WDRMausIE,
 )
 from .webofstories import (
     WebOfStoriesIE,
     WebOfStoriesPlaylistIE,
 )
-from .weibo import WeiboIE
 from .weiqitv import WeiqiTVIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
-from .wrzuta import WrzutaIE
+from .wrzuta import (
+    WrzutaIE,
+    WrzutaPlaylistIE,
+)
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
@@ -943,6 +1027,12 @@ from .xhamster import (
     XHamsterIE,
     XHamsterEmbedIE,
 )
+from .xiami import (
+    XiamiSongIE,
+    XiamiAlbumIE,
+    XiamiArtistIE,
+    XiamiCollectionIE
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
@@ -964,7 +1054,10 @@ from .yesjapan import YesJapanIE
 from .yinyuetai import YinYueTaiIE
 from .ynet import YnetIE
 from .youjizz import YouJizzIE
-from .youku import YoukuIE
+from .youku import (
+    YoukuIE,
+    YoukuShowIE,
+)
 from .youporn import YouPornIE
 from .yourupload import YourUploadIE
 from .youtube import (
@@ -979,6 +1072,7 @@ from .youtube import (
     YoutubeSearchDateIE,
     YoutubeSearchIE,
     YoutubeSearchURLIE,
+    YoutubeSharedVideoIE,
     YoutubeShowIE,
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py
new file mode 100644 (file)
index 0000000..2f30351
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    parse_duration,
+    ExtractorError,
+)
+
+
+class EyedoTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301',
+        'md5': 'ba14f17995cdfc20c36ba40e21bf73f7',
+        'info_dict': {
+            'id': '16301',
+            'ext': 'mp4',
+            'title': 'Journée du conseil scientifique de l\'Afnic 2015',
+            'description': 'md5:4abe07293b2f73efc6e1c37028d58c98',
+            'uploader': 'Afnic Live',
+            'uploader_id': '8023',
+        }
+    }
+    _ROOT_URL = 'http://live.eyedo.net:1935/'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id)
+
+        def _add_ns(path):
+            return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api')
+
+        title = xpath_text(video_data, _add_ns('Titre'), 'title', True)
+        state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'state live code', True)
+        if state_live_code == 'avenir':
+            raise ExtractorError(
+                '%s said: We\'re sorry, but this video is not yet available.' % self.IE_NAME,
+                expected=True)
+
+        is_live = state_live_code == 'live'
+        m3u8_url = None
+        # http://eyedo.tv/Content/Html5/Scripts/html5view.js
+        if is_live:
+            if xpath_text(video_data, _add_ns('Cdn')) == 'true':
+                m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id
+            else:
+                m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id
+        else:
+            m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'),
+            'description': xpath_text(video_data, _add_ns('Description')),
+            'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
+            'uploader': xpath_text(video_data, _add_ns('Createur')),
+            'uploader_id': xpath_text(video_data, _add_ns('CreateurId')),
+            'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')),
+            'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')),
+        }
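
The _add_ns helper exists because the GetLive response uses a default XML namespace, so every lookup must be namespace-qualified. A minimal sketch with stdlib ElementTree (the namespace and element names are from the code above; the XML body itself is invented):

    import xml.etree.ElementTree as ET

    NS = 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api'
    doc = ET.fromstring(
        '<Live xmlns="%s"><Titre>Demo</Titre><StateLiveCode>live</StateLiveCode></Live>' % NS)

    def add_ns(tag):  # the expansion _xpath_ns performs for a bare tag
        return '{%s}%s' % (NS, tag)

    print(doc.findtext(add_ns('Titre')))          # Demo
    print(doc.findtext(add_ns('StateLiveCode')))  # live
    print(doc.findtext('Titre'))                  # None - unqualified lookups miss
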
index f5bbd39d2d0e90996c118e3fae325034fc2bbb6d..0d43acc4ac7ff0162bdcc95471ccdb2376c0f7f2 100644 (file)
@@ -129,6 +129,21 @@ class FacebookIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+        # Facebook API embed
+        # see https://developers.facebook.com/docs/plugins/embedded-video-player
+        mobj = re.search(r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook\.com/.+?)(?P=q2)''', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
     def _login(self):
         (useremail, password) = self._get_login_info()
         if useremail is None:
@@ -204,12 +219,23 @@ class FacebookIE(InfoExtractor):
 
         BEFORE = '{swf.addParam(param[0], param[1]);});'
         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
-        m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
-        if m:
-            swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+        PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER)
+
+        for m in re.findall(PATTERN, webpage):
+            swf_params = m.replace('\\\\', '\\').replace('\\"', '"')
             data = dict(json.loads(swf_params))
             params_raw = compat_urllib_parse_unquote(data['params'])
-            video_data = json.loads(params_raw)['video_data']
+            video_data_candidate = json.loads(params_raw)['video_data']
+            for _, f in video_data_candidate.items():
+                if not f:
+                    continue
+                if isinstance(f, dict):
+                    f = [f]
+                if not isinstance(f, list):
+                    continue
+                if f[0].get('video_id') == video_id:
+                    video_data = video_data_candidate
+                    break
 
         def video_data_list2dict(video_data):
             ret = {}
@@ -239,6 +265,8 @@ class FacebookIE(InfoExtractor):
 
         formats = []
         for format_id, f in video_data.items():
+            if f and isinstance(f, dict):
+                f = [f]
             if not f or not isinstance(f, list):
                 continue
             for quality in ('sd', 'hd'):
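
The second pattern in _extract_url above targets the documented fb-video/fb-post embed divs. A self-contained check (the markup shape follows the linked Facebook docs; the fragment and video path are invented):

    import re

    page = ('<div class="fb-video" '
            'data-href="https://www.facebook.com/facebook/videos/10153231379946729/"></div>')
    m = re.search(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
            data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook\.com/.+?)(?P=q2)''', page)
    print(m.group('url'))  # the data-href value, which GenericIE hands back to FacebookIE
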
index f1f150ef2ce41defbcee841d86fe4f9ada34d25d..8d1010b88c83dcbfd3e71e9f20275bf6fb9c9d21 100644 (file)
@@ -1,20 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 
 
 class FczenitIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://fc-zenit.ru/video/gl6785/',
-        'md5': '458bacc24549173fe5a5aa29174a5606',
+        'url': 'http://fc-zenit.ru/video/41044/',
+        'md5': '0e3fab421b455e970fa1aa3891e57df0',
         'info_dict': {
-            'id': '6785',
+            'id': '41044',
             'ext': 'mp4',
-            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
         },
     }
 
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+        video_title = self._html_search_regex(
+            r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+        video_items = self._parse_json(self._search_regex(
+            r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+            video_id)
 
-        bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
-        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+        def merge_dicts(*dicts):
+            ret = {}
+            for a_dict in dicts:
+                ret.update(a_dict)
+            return ret
 
         formats = [{
-            'url': furl,
-            'tbr': tbr,
-        } for furl, tbr in bitrates]
+            'url': compat_urlparse.urljoin(url, video_url),
+            'tbr': int(tbr),
+        } for tbr, video_url in merge_dicts(*video_items).items()]
 
         self._sort_formats(formats)
 
index 0a3de14988dc06e92a7a27e52c4c7838caf69b2b..a8e1bf42a433fd87f638e8b34ce5ab68464a9252 100644 (file)
@@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor):
             'upload_date': '20110423',
             'uploader_id': '10922353@N03',
             'uploader': 'Forest Wander',
+            'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/',
             'comment_count': int,
             'view_count': int,
             'tags': list,
+            'license': 'Attribution-ShareAlike',
         }
     }
-
     _API_BASE_URL = 'https://api.flickr.com/services/rest?'
+    # https://help.yahoo.com/kb/flickr/SLN25525.html
+    _LICENSES = {
+        '0': 'All Rights Reserved',
+        '1': 'Attribution-NonCommercial-ShareAlike',
+        '2': 'Attribution-NonCommercial',
+        '3': 'Attribution-NonCommercial-NoDerivs',
+        '4': 'Attribution',
+        '5': 'Attribution-ShareAlike',
+        '6': 'Attribution-NoDerivs',
+        '7': 'No known copyright restrictions',
+        '8': 'United States government work',
+        '9': 'Public Domain Dedication (CC0)',
+        '10': 'Public Domain Work',
+    }
 
     def _call_api(self, method, video_id, api_key, note, secret=None):
         query = {
@@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor):
             self._sort_formats(formats)
 
             owner = video_info.get('owner', {})
+            uploader_id = owner.get('nsid')
+            uploader_path = owner.get('path_alias') or uploader_id
+            uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
 
             return {
                 'id': video_id,
@@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor):
                 'formats': formats,
                 'timestamp': int_or_none(video_info.get('dateuploaded')),
                 'duration': int_or_none(video_info.get('video', {}).get('duration')),
-                'uploader_id': owner.get('nsid'),
+                'uploader_id': uploader_id,
                 'uploader': owner.get('realname'),
+                'uploader_url': uploader_url,
                 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')),
                 'view_count': int_or_none(video_info.get('views')),
-                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])]
+                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])],
+                'license': self._LICENSES.get(video_info.get('license')),
             }
         else:
             raise ExtractorError('not a video', expected=True)
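
The new uploader_url prefers the human-readable path_alias and falls back to the raw nsid, and license codes go through the table above. A sketch using the owner values from the test case:

    owner = {'nsid': '10922353@N03', 'path_alias': 'forestwander-nature-pictures'}
    uploader_id = owner.get('nsid')
    uploader_path = owner.get('path_alias') or uploader_id  # alias first, nsid as fallback
    uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
    print(uploader_url)  # https://www.flickr.com/photos/forestwander-nature-pictures/

    LICENSES = {'4': 'Attribution', '5': 'Attribution-ShareAlike'}  # excerpt of the table
    print(LICENSES.get('5'))  # the 'license' value asserted in the test
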
diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py
new file mode 100644 (file)
index 0000000..acb6133
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class FlipagramIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://flipagram.com/f/nyvTSJMKId',
+        'md5': '888dcf08b7ea671381f00fab74692755',
+        'info_dict': {
+            'id': 'nyvTSJMKId',
+            'ext': 'mp4',
+            'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+            'description': 'md5:d55e32edc55261cae96a41fa85ff630e',
+            'duration': 35.571,
+            'timestamp': 1461244995,
+            'upload_date': '20160421',
+            'uploader': 'kitty juria',
+            'uploader_id': 'sjuria101',
+            'creator': 'kitty juria',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'comment_count': int,
+            'comments': list,
+            'formats': 'mincount:2',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_data = self._parse_json(
+            self._search_regex(
+                r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'),
+            video_id)
+
+        flipagram = video_data['flipagram']
+        video = flipagram['video']
+
+        json_ld = self._search_json_ld(webpage, video_id, default=False)
+        title = json_ld.get('title') or flipagram['captionText']
+        description = json_ld.get('description') or flipagram.get('captionText')
+
+        formats = [{
+            'url': video['url'],
+            'width': int_or_none(video.get('width')),
+            'height': int_or_none(video.get('height')),
+            'filesize': int_or_none(video_data.get('size')),
+        }]
+
+        preview_url = try_get(
+            flipagram, lambda x: x['music']['track']['previewUrl'], compat_str)
+        if preview_url:
+            formats.append({
+                'url': preview_url,
+                'ext': 'm4a',
+                'vcodec': 'none',
+            })
+
+        self._sort_formats(formats)
+
+        counts = flipagram.get('counts', {})
+        user = flipagram.get('user', {})
+        video_data = flipagram.get('video', {})
+
+        thumbnails = [{
+            'url': self._proto_relative_url(cover['url']),
+            'width': int_or_none(cover.get('width')),
+            'height': int_or_none(cover.get('height')),
+            'filesize': int_or_none(cover.get('size')),
+        } for cover in flipagram.get('covers', []) if cover.get('url')]
+
+        # Note that this only retrieves comments that are initially loaded.
+        # For videos with a large number of comments, most won't be retrieved.
+        comments = []
+        for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []):
+            text = comment.get('comment')
+            if not text or not isinstance(text, list):
+                continue
+            comments.append({
+                'author': comment.get('user', {}).get('name'),
+                'author_id': comment.get('user', {}).get('username'),
+                'id': comment.get('id'),
+                'text': text[0],
+                'timestamp': unified_timestamp(comment.get('created')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': float_or_none(flipagram.get('duration'), 1000),
+            'thumbnails': thumbnails,
+            'timestamp': unified_timestamp(flipagram.get('iso8601Created')),
+            'uploader': user.get('name'),
+            'uploader_id': user.get('username'),
+            'creator': user.get('name'),
+            'view_count': int_or_none(counts.get('plays')),
+            'like_count': int_or_none(counts.get('likes')),
+            'repost_count': int_or_none(counts.get('reflips')),
+            'comment_count': int_or_none(counts.get('comments')),
+            'comments': comments,
+            'formats': formats,
+        }
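
try_get is used above to reach into nested, possibly missing JSON (music -> track -> previewUrl) without a chain of guards. A minimal stand-in approximating youtube_dl.utils.try_get, run against invented data:

    def try_get(src, getter, expected_type=None):
        # Run the getter, swallow the usual lookup failures, optionally type-check.
        try:
            v = getter(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            return None
        return v if expected_type is None or isinstance(v, expected_type) else None

    flipagram = {'music': {'track': {'previewUrl': 'https://example.com/preview.m4a'}}}
    print(try_get(flipagram, lambda x: x['music']['track']['previewUrl'], str))  # the URL
    print(try_get({}, lambda x: x['music']['track']['previewUrl'], str))         # None
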
diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py
new file mode 100644 (file)
index 0000000..322c41e
--- /dev/null
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Formula1IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
+    _TEST = {
+        'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+        'md5': '8c79e54be72078b26b89e0e111c0502b',
+        'info_dict': {
+            'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+            'ext': 'flv',
+            'title': 'Race highlights - Spain 2016',
+        },
+        'add_ie': ['Ooyala'],
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        ooyala_embed_code = self._search_regex(
+            r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+        return self.url_result(
+            'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
index df7665176ec3827f836e18b8ca46e3fec7c97c3b..a3bb98377cf4feb769d89769c40fe7098ae20743 100644 (file)
@@ -1,7 +1,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+    smuggle_url,
+    update_url_query,
+)
 
 
 class FoxSportsIE(InfoExtractor):
@@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.foxsports.com/video?vid=432609859715',
+        'md5': 'b49050e955bebe32c301972e4012ac17',
         'info_dict': {
-            'id': 'gA0bHB3Ladz3',
-            'ext': 'flv',
+            'id': 'i0qKWsk3qJaM',
+            'ext': 'mp4',
             'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
             'description': 'Courtney Lee talks about Memphis being focused.',
+            'upload_date': '20150423',
+            'timestamp': 1429761109,
+            'uploader': 'NEWA-FNG-FOXSPORTS',
         },
         'add_ie': ['ThePlatform'],
     }
@@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor):
                 r"data-player-config='([^']+)'", webpage, 'data player config'),
             video_id)
 
-        return self.url_result(smuggle_url(
-            config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True}))
+        return self.url_result(smuggle_url(update_url_query(
+            config['releaseURL'], {
+                'mbr': 'true',
+                'switch': 'http',
+            }), {'force_smil_url': True}))
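
update_url_query replaces the old string concatenation ('&manifest=f4m') with proper query merging. A rough stdlib stand-in for the helper (the releaseURL below is invented; real values come from the page's data-player-config):

    try:
        from urllib.parse import urlparse, urlunparse, parse_qs, urlencode  # Python 3
    except ImportError:  # Python 2
        from urlparse import urlparse, urlunparse, parse_qs
        from urllib import urlencode

    def update_url_query(url, query):
        # Merge new parameters into whatever query string the URL already has.
        parsed = urlparse(url)
        qs = parse_qs(parsed.query)
        qs.update(query)
        return urlunparse(parsed._replace(query=urlencode(qs, doseq=True)))

    print(update_url_query('http://link.theplatform.com/s/foo/bar?format=SMIL',
                           {'mbr': 'true', 'switch': 'http'}))
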
index ad94e31f346cc97cd71ad1be9f6983a16b6df209..7653975e3cdfa5b7ae12845b760c6283c5f0c22c 100644 (file)
@@ -14,7 +14,10 @@ from ..utils import (
     parse_duration,
     determine_ext,
 )
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # Dailymotion embed
+        'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
+        'md5': 'ee7f1828f25a648addc90cb2687b1f12',
+        'info_dict': {
+            'id': 'x4iiko0',
+            'ext': 'mp4',
+            'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
+            'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
+            'timestamp': 1467011958,
+            'upload_date': '20160627',
+            'uploader': 'France Inter',
+            'uploader_id': 'x2q2ez',
+        },
+        'add_ie': ['Dailymotion'],
     }]
 
     def _real_extract(self, url):
@@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
 
         dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
         if dmcloud_url:
-            return self.url_result(dmcloud_url, 'DailymotionCloud')
+            return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key())
+
+        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        if dailymotion_urls:
+            return self.playlist_result([
+                self.url_result(dailymotion_url, DailymotionIE.ie_key())
+                for dailymotion_url in dailymotion_urls])
 
         video_id, catalogue = self._search_regex(
             (r'id-video=([^@]+@[^"]+)',
index 1eb528f31f4b908b8d832cfe1fd4e1647ef74058..0ad0d9b6a9fe789228487e861139fa2166d88767 100644 (file)
@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse_unquote_plus,
+)
 from ..utils import (
     clean_html,
     determine_ext,
@@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:1769f43cd5fc130ace8fd87232207892',
             'thumbnail': 're:https?://.*\.jpg',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
     }, {
         'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
         'info_dict': {
@@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
             'thumbnail': 're:https?://.*\.jpg',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
     }, {
         'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
         'info_dict': {
@@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
             'thumbnail': 're:https?://.*\.(?:jpg|png)',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
     }]
 
+    _LOGIN_URL = 'http://www.funimation.com/login'
+
+    def _download_webpage(self, *args, **kwargs):
+        try:
+            return super(FunimationIE, self)._download_webpage(*args, **kwargs)
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                response = ee.cause.read()
+                if b'>Please complete the security check to access<' in response:
+                    raise ExtractorError(
+                        'Access to funimation.com is blocked by CloudFlare. '
+                        'Please browse to http://www.funimation.com/, solve '
+                        'the reCAPTCHA, export browser cookies to a text file,'
+                        ' and then try again with --cookies YOUR_COOKIE_FILE.',
+                        expected=True)
+            raise
+
+    def _extract_cloudflare_session_ua(self, url):
+        ci_session_cookie = self._get_cookies(url).get('ci_session')
+        if ci_session_cookie:
+            ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
+            # ci_session is a string serialized by PHP's serialize();
+            # this simple case can be handled with regular expressions alone
+            return self._search_regex(
+                r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
+                default=None)
+
     def _login(self):
         (username, password) = self._get_login_info()
         if username is None:
@@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor):
             'email_field': username,
             'password_field': password,
         })
-        login_request = sanitized_Request('http://www.funimation.com/login', data, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
+        user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
+        if not user_agent:
+            user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
+        login_request = sanitized_Request(self._LOGIN_URL, data, headers={
+            'User-Agent': user_agent,
             'Content-Type': 'application/x-www-form-urlencoded'
         })
         login_page = self._download_webpage(
@@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor):
             ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
         )
 
+        user_agent = self._extract_cloudflare_session_ua(url)
+        if user_agent:
+            USER_AGENTS = ((None, user_agent),)
+
         for kind, user_agent in USER_AGENTS:
             request = sanitized_Request(url)
             request.add_header('User-Agent', user_agent)
             webpage = self._download_webpage(
-                request, display_id, 'Downloading %s webpage' % kind)
+                request, display_id,
+                'Downloading %s webpage' % kind if kind else 'Downloading webpage')
 
             playlist = self._parse_json(
                 self._search_regex(
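
The ci_session cookie is a URL-encoded PHP serialize() blob in which the user agent appears as a length-prefixed string, so one regex suffices. A standalone run with an invented cookie value:

    import re
    try:
        from urllib.parse import unquote_plus  # Python 3
    except ImportError:  # Python 2
        from urllib import unquote_plus

    # Decodes to: a:1:{s:10:"user_agent";s:11:"Mozilla/5.0";}
    ci_session = unquote_plus(
        'a%3A1%3A%7Bs%3A10%3A%22user_agent%22%3Bs%3A11%3A%22Mozilla%2F5.0%22%3B%7D')
    print(re.search(r'"user_agent";s:\d+:"([^"]+)"', ci_session).group(1))  # Mozilla/5.0
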
diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py
new file mode 100644 (file)
index 0000000..b4ab4cb
--- /dev/null
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class FusionIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/',
+        'info_dict': {
+            'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P',
+            'ext': 'mp4',
+            'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs',
+            'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7',
+            'duration': 140.0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://fusion.net/video/201781',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        ooyala_code = self._search_regex(
+            r'data-video-id=(["\'])(?P<code>.+?)\1',
+            webpage, 'ooyala code', group='code')
+
+        return OoyalaIE._build_url_result(ooyala_code)
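
The data-video-id regex uses a quote backreference so single- and double-quoted attributes both match, and the captured code is handed straight to OoyalaIE. A standalone check with invented markup around the embed code from the test above:

    import re

    webpage = "<div data-video-id='ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P'></div>"
    code = re.search(
        r'data-video-id=(["\'])(?P<code>.+?)\1', webpage).group('code')
    print('ooyala:%s' % code)  # the URL form passed to OoyalaIE._build_url_result
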
index 4ffdd75157486957810f718cb1019cdc5dd80f4f..621257c9f72383a5c6133001953a7e51cc5df435 100644 (file)
@@ -1,19 +1,19 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
-from .common import InfoExtractor
+from .once import OnceIE
 from ..compat import (
     compat_urllib_parse_unquote,
-    compat_urlparse,
 )
 from ..utils import (
     unescapeHTML,
+    url_basename,
+    dict_get,
 )
 
 
-class GameSpotIE(InfoExtractor):
+class GameSpotIE(OnceIE):
     _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
     _TESTS = [{
         'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
@@ -39,29 +39,73 @@ class GameSpotIE(InfoExtractor):
         webpage = self._download_webpage(url, page_id)
         data_video_json = self._search_regex(
             r'data-video=["\'](.*?)["\']', webpage, 'data video')
-        data_video = json.loads(unescapeHTML(data_video_json))
+        data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
         streams = data_video['videoStreams']
 
+        manifest_url = None
         formats = []
         f4m_url = streams.get('f4m_stream')
-        if f4m_url is not None:
-            # Transform the manifest url to a link to the mp4 files
-            # they are used in mobile devices.
-            f4m_path = compat_urlparse.urlparse(f4m_url).path
-            QUALITIES_RE = r'((,\d+)+,?)'
-            qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
-            http_path = f4m_path[1:].split('/', 1)[1]
-            http_template = re.sub(QUALITIES_RE, r'%s', http_path)
-            http_template = http_template.replace('.csmil/manifest.f4m', '')
-            http_template = compat_urlparse.urljoin(
-                'http://video.gamespotcdn.com/', http_template)
-            for q in qualities:
-                formats.append({
-                    'url': http_template % q,
-                    'ext': 'mp4',
-                    'format_id': q,
-                })
-        else:
+        if f4m_url:
+            manifest_url = f4m_url
+            formats.extend(self._extract_f4m_formats(
+                f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))
+        m3u8_url = streams.get('m3u8_stream')
+        if m3u8_url:
+            manifest_url = m3u8_url
+            m3u8_formats = self._extract_m3u8_formats(
+                m3u8_url, page_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False)
+            formats.extend(m3u8_formats)
+        progressive_url = dict_get(
+            streams, ('progressive_hd', 'progressive_high', 'progressive_low'))
+        if progressive_url and manifest_url:
+            qualities_basename = self._search_regex(
+                r'/([^/]+)\.csmil/',
+                manifest_url, 'qualities basename', default=None)
+            if qualities_basename:
+                QUALITIES_RE = r'((,\d+)+,?)'
+                qualities = self._search_regex(
+                    QUALITIES_RE, qualities_basename,
+                    'qualities', default=None)
+                if qualities:
+                    qualities = list(map(int, qualities.strip(',').split(',')))
+                    qualities.sort()
+                    http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
+                    http_url_basename = url_basename(progressive_url)
+                    if m3u8_formats:
+                        self._sort_formats(m3u8_formats)
+                        m3u8_formats = list(filter(
+                            lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                            m3u8_formats))
+                    if len(qualities) == len(m3u8_formats):
+                        for q, m3u8_format in zip(qualities, m3u8_formats):
+                            f = m3u8_format.copy()
+                            f.update({
+                                'url': progressive_url.replace(
+                                    http_url_basename, http_template % q),
+                                'format_id': f['format_id'].replace('hls', 'http'),
+                                'protocol': 'http',
+                            })
+                            formats.append(f)
+                    else:
+                        for q in qualities:
+                            formats.append({
+                                'url': progressive_url.replace(
+                                    http_url_basename, http_template % q),
+                                'ext': 'mp4',
+                                'format_id': 'http-%d' % q,
+                                'tbr': q,
+                            })
+
+        onceux_json = self._search_regex(
+            r'data-onceux-options=["\'](.*?)["\']', webpage, 'onceux options', default=None)
+        if onceux_json:
+            onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
+            if onceux_url:
+                formats.extend(self._extract_once_formats(re.sub(
+                    r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', '')))
+
+        if not formats:
             for quality in ['sd', 'hd']:
                 # It's actually a link to a flv file
                 flv_url = streams.get('f4m_{0}'.format(quality))
@@ -71,6 +115,7 @@ class GameSpotIE(InfoExtractor):
                         'ext': 'flv',
                         'format_id': quality,
                     })
+        self._sort_formats(formats)
 
         return {
             'id': data_video['guid'],
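
The progressive branch above rebuilds per-quality HTTP URLs from the comma-separated list embedded in the csmil manifest basename. A worked run of that templating with an invented manifest URL:

    import re

    manifest_url = 'http://example.com/vods/video,400,800,1200,.mp4.csmil/manifest.f4m'

    QUALITIES_RE = r'((,\d+)+,?)'
    basename = re.search(r'/([^/]+)\.csmil/', manifest_url).group(1)
    qualities = sorted(int(q) for q in
                       re.search(QUALITIES_RE, basename).group(1).strip(',').split(','))
    template = re.sub(QUALITIES_RE, '%d', basename)

    print(qualities)                          # [400, 800, 1200]
    print([template % q for q in qualities])  # substituted into the progressive URL basename
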
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
deleted file mode 100644 (file)
index 1e7948a..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_age_limit,
-    url_basename,
-)
-
-
-class GametrailersIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)'
-
-    _TEST = {
-        'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review',
-        'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a',
-        'info_dict': {
-            'id': '2983958',
-            'ext': 'mp4',
-            'display_id': '116437-Just-Cause-3-Review',
-            'title': 'Just Cause 3 - Review',
-            'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?',
-        },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        title = self._html_search_regex(
-            r'<title>(.+?)\|', webpage, 'title').strip()
-        embed_url = self._proto_relative_url(
-            self._search_regex(
-                r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage,
-                'embed url'),
-            scheme='http:')
-        video_id = url_basename(embed_url)
-        embed_page = self._download_webpage(embed_url, video_id)
-        embed_vars_json = self._search_regex(
-            r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page,
-            'embed vars')
-        info = self._parse_json(embed_vars_json, video_id)
-
-        formats = []
-        for media in info['media']:
-            if media['mediaPurpose'] == 'play':
-                formats.append({
-                    'url': media['uri'],
-                    'height': media['height'],
-                    'width:': media['width'],
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': info.get('thumbUri'),
-            'description': self._og_search_description(webpage),
-            'duration': int_or_none(info.get('videoLengthInSeconds')),
-            'age_limit': parse_age_limit(info.get('audienceRating')),
-        }
index 95d23325900e8ed0d61bcec5d09ff3ba3d5e7a82..4efdf146e740214ad776b6a6c3e939e491cf3814 100644 (file)
@@ -49,9 +49,12 @@ from .pornhub import PornHubIE
 from .xhamster import XHamsterEmbedIE
 from .tnaflix import TNAFlixNetworkEmbedIE
 from .vimeo import VimeoIE
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)
 from .onionstudios import OnionStudiosIE
-from .snagfilms import SnagFilmsEmbedIE
+from .viewlift import ViewLiftEmbedIE
 from .screenwavemedia import ScreenwaveMediaIE
 from .mtv import MTVServicesEmbeddedIE
 from .pladform import PladformIE
@@ -61,6 +64,12 @@ from .jwplatform import JWPlatformIE
 from .digiteka import DigitekaIE
 from .instagram import InstagramIE
 from .liveleak import LiveLeakIE
+from .threeqsdn import ThreeQSDNIE
+from .theplatform import ThePlatformIE
+from .vessel import VesselIE
+from .kaltura import KalturaIE
+from .eagleplatform import EaglePlatformIE
+from .facebook import FacebookIE
 
 
 class GenericIE(InfoExtractor):
@@ -237,6 +246,7 @@ class GenericIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'car-20120827-manifest',
                 'formats': 'mincount:9',
+                'upload_date': '20130904',
             },
             'params': {
                 'format': 'bestvideo',
@@ -596,7 +606,11 @@ class GenericIE(InfoExtractor):
                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
                 'ext': 'mp4',
                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+                'description': 'md5:faf028e48a461b8b7fad38f1e104b119',
                 'uploader': 'Spi0n',
+                'uploader_id': 'xgditw',
+                'upload_date': '20140425',
+                'timestamp': 1398441542,
             },
             'add_ie': ['Dailymotion'],
         },
@@ -619,13 +633,13 @@ class GenericIE(InfoExtractor):
         },
         # MTVServices embed
         {
-            'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
-            'md5': '35727f82f58c76d996fc188f9755b0d5',
+            'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
+            'md5': 'ca1aef97695ef2c1d6973256a57e5252',
             'info_dict': {
-                'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
+                'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1',
                 'ext': 'mp4',
-                'title': 'Review',
-                'description': 'Mario\'s life in the fast lane has never looked so good.',
+                'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored',
+                'description': 'Two valets share their love for movie star Liam Neesons.',
             },
         },
         # YouTube embed via <data-embed-url="">
@@ -711,15 +725,18 @@ class GenericIE(InfoExtractor):
         },
         # Wistia embed
         {
-            'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
-            'md5': '8788b683c777a5cf25621eaf286d0c23',
+            'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+            'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
             'info_dict': {
-                'id': '1cfaf6b7ea',
+                'id': '6e2wtrbdaf',
                 'ext': 'mov',
-                'title': 'md5:51364a8d3d009997ba99656004b5e20d',
-                'duration': 643.0,
-                'filesize': 182808282,
-                'uploader': 'education-portal.com',
+                'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+                'description': 'a Paywall Videos video from Remilon',
+                'duration': 644.072,
+                'uploader': 'study.com',
+                'timestamp': 1459678540,
+                'upload_date': '20160403',
+                'filesize': 24687186,
             },
         },
         {
@@ -728,10 +745,29 @@ class GenericIE(InfoExtractor):
             'info_dict': {
                 'id': 'uxjb0lwrcz',
                 'ext': 'mp4',
-                'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+                'title': 'Conversation about Hexagonal Rails Part 1',
+                'description': 'a Martin Fowler video from ThoughtWorks',
                 'duration': 1715.0,
                 'uploader': 'thoughtworks.wistia.com',
+                'timestamp': 1401832161,
+                'upload_date': '20140603',
+            },
+        },
+        # Wistia standard embed (async)
+        {
+            'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
+            'info_dict': {
+                'id': '807fafadvk',
+                'ext': 'mp4',
+                'title': 'Drip Brennan Dunn Workshop',
+                'description': 'a JV Webinars video from getdrip-1',
+                'duration': 4986.95,
+                'timestamp': 1463607249,
+                'upload_date': '20160518',
             },
+            'params': {
+                'skip_download': True,
+            }
         },
         # Soundcloud embed
         {
@@ -755,6 +791,19 @@ class GenericIE(InfoExtractor):
                 'title': 'Rosetta #CometLanding webcast HL 10',
             }
         },
+        # Another Livestream embed, without 'new.' in URL
+        {
+            'url': 'https://www.freespeech.org/',
+            'info_dict': {
+                'id': '123537347',
+                'ext': 'mp4',
+                'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            },
+            'params': {
+                # Live stream
+                'skip_download': True,
+            },
+        },
         # LazyYT
         {
             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
@@ -839,18 +888,6 @@ class GenericIE(InfoExtractor):
                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
             }
         },
-        # Kaltura embed
-        {
-            'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
-            'info_dict': {
-                'id': '1_eergr3h1',
-                'ext': 'mp4',
-                'upload_date': '20150226',
-                'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
-                'timestamp': int,
-                'title': 'John Carlson Postgame 2/25/15',
-            },
-        },
         # Kaltura embed (different embed code)
         {
             'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
@@ -876,9 +913,41 @@ class GenericIE(InfoExtractor):
                 'uploader_id': 'echojecka',
             },
         },
+        # Kaltura embed with single quotes
+        {
+            'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
+            'info_dict': {
+                'id': '0_izeg5utt',
+                'ext': 'mp4',
+                'title': '35871',
+                'timestamp': 1355743100,
+                'upload_date': '20121217',
+                'uploader_id': 'batchUser',
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # Kaltura embedded via quoted entry_id
+            'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
+            'info_dict': {
+                'id': '0_utuok90b',
+                'ext': 'mp4',
+                'title': '06_matthew_brender_raj_dutt',
+                'timestamp': 1466638791,
+                'upload_date': '20160622',
+            },
+            'add_ie': ['Kaltura'],
+            'expected_warnings': [
+                'Could not send HEAD request'
+            ],
+            'params': {
+                'skip_download': True,
+            }
+        },
         # Eagle.Platform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
             'info_dict': {
                 'id': '227304',
                 'ext': 'mp4',
@@ -893,6 +962,7 @@ class GenericIE(InfoExtractor):
         # ClipYou (Eagle.Platform) embed (custom URL)
         {
             'url': 'http://muz-tv.ru/play/7129/',
+            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
             'info_dict': {
                 'id': '12820',
                 'ext': 'mp4',
@@ -981,18 +1051,36 @@ class GenericIE(InfoExtractor):
                 'ext': 'flv',
                 'title': "PFT Live: New leader in the 'new-look' defense",
                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+                'uploader': 'NBCU-SPORTS',
+                'upload_date': '20140107',
+                'timestamp': 1389118457,
+            },
+        },
+        # NBC News embed
+        {
+            'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
+            'md5': '1aa589c675898ae6d37a17913cf68d66',
+            'info_dict': {
+                'id': '701714499682',
+                'ext': 'mp4',
+                'title': 'PREVIEW: On Assignment: David Letterman',
+                'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
             },
         },
         # UDN embed
         {
-            'url': 'http://www.udn.com/news/story/7314/822787',
+            'url': 'https://video.udn.com/news/300346',
             'md5': 'fd2060e988c326991037b9aff9df21a6',
             'info_dict': {
                 'id': '300346',
                 'ext': 'mp4',
                 'title': '中一中男師變性 全校師生力挺',
                 'thumbnail': 're:^https?://.*\.jpg$',
-            }
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         # Ooyala embed
         {
@@ -1009,20 +1097,6 @@ class GenericIE(InfoExtractor):
                 'skip_download': True,
             }
         },
-        # Contains a SMIL manifest
-        {
-            'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
-            'info_dict': {
-                'id': 'file',
-                'ext': 'flv',
-                'title': '+ Football: Lottery Champions League Europe',
-                'uploader': 'www.telewebion.com',
-            },
-            'params': {
-                # rtmpe downloads
-                'skip_download': True,
-            }
-        },
         # Brightcove URL in single quotes
         {
             'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
@@ -1033,17 +1107,25 @@ class GenericIE(InfoExtractor):
                 'title': 'SN Presents: Russell Martin, World Citizen',
                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
                 'uploader': 'Rogers Sportsnet',
+                'uploader_id': '1704050871',
+                'upload_date': '20150525',
+                'timestamp': 1432570283,
             },
         },
         # Dailymotion Cloud video
         {
             'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
-            'md5': '49444254273501a64675a7e68c502681',
+            'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
             'info_dict': {
-                'id': '5585de919473990de4bee11b',
+                'id': 'x2uy8t3',
                 'ext': 'mp4',
-                'title': 'Le débat',
+                'title': 'Sauvons les abeilles ! - Le débat',
+                'description': 'md5:d9082128b1c5277987825d684939ca26',
                 'thumbnail': 're:^https?://.*\.jpe?g$',
+                'timestamp': 1434970506,
+                'upload_date': '20150622',
+                'uploader': 'Public Sénat',
+                'uploader_id': 'xa9gza',
             }
         },
         # OnionStudios embed
@@ -1124,6 +1206,9 @@ class GenericIE(InfoExtractor):
                 'title': 'The Cardinal Pell Interview',
                 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
                 'uploader': 'GlobeCast Australia - GlobeStream',
+                'uploader_id': '2733773828001',
+                'upload_date': '20160304',
+                'timestamp': 1457083087,
             },
             'params': {
                 # m3u8 downloads
@@ -1154,6 +1239,97 @@ class GenericIE(InfoExtractor):
                 'uploader': 'Lake8737',
             }
         },
+        # Duplicated embedded video URLs
+        {
+            'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
+            'info_dict': {
+                'id': '149298443_480_16c25b74_2',
+                'ext': 'mp4',
+                'title': 'vs. Blue Orange Spring Game',
+                'uploader': 'www.hudl.com',
+            },
+        },
+        # twitter:player embed
+        {
+            'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
+            'md5': 'a3e0df96369831de324f0778e126653c',
+            'info_dict': {
+                'id': '4909620399001',
+                'ext': 'mp4',
+                'title': 'What Do Black Holes Sound Like?',
+                'description': 'what do black holes sound like',
+                'upload_date': '20160524',
+                'uploader_id': '29913724001',
+                'timestamp': 1464107587,
+                'uploader': 'TheAtlantic',
+            },
+            'add_ie': ['BrightcoveLegacy'],
+        },
+        # Facebook <iframe> embed
+        {
+            'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
+            'md5': 'fbcde74f534176ecb015849146dd3aee',
+            'info_dict': {
+                'id': '599637780109885',
+                'ext': 'mp4',
+                'title': 'Facebook video #599637780109885',
+            },
+        },
+        # Facebook API embed
+        {
+            'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
+            'md5': 'a47372ee61b39a7b90287094d447d94e',
+            'info_dict': {
+                'id': '10153467542406923',
+                'ext': 'mp4',
+                'title': 'Facebook video #10153467542406923',
+            },
+        },
+        # Wordpress "YouTube Video Importer" plugin
+        {
+            'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
+            'md5': 'd16797741b560b485194eddda8121b48',
+            'info_dict': {
+                'id': 'HNTXWDXV9Is',
+                'ext': 'mp4',
+                'title': 'Blue Devils Drumline Stanford lot 2016',
+                'upload_date': '20160627',
+                'uploader_id': 'GENOCIDE8GENERAL10',
+                'uploader': 'cylus cyrus',
+            },
+        },
+        {
+            # video stored on custom kaltura server
+            'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv',
+            'md5': '537617d06e64dfed891fa1593c4b30cc',
+            'info_dict': {
+                'id': '0_1iotm5bh',
+                'ext': 'mp4',
+                'title': 'Elecciones británicas: 5 lecciones para Rajoy',
+                'description': 'md5:435a89d68b9760b92ce67ed227055f16',
+                'uploader_id': 'videos.expansion@el-mundo.net',
+                'upload_date': '20150429',
+                'timestamp': 1430303472,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        # {
+        #     # TODO: find another test
+        #     # http://schema.org/VideoObject
+        #     'url': 'https://flipagram.com/f/nyvTSJMKId',
+        #     'md5': '888dcf08b7ea671381f00fab74692755',
+        #     'info_dict': {
+        #         'id': 'nyvTSJMKId',
+        #         'ext': 'mp4',
+        #         'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+        #         'description': '#love for cats.',
+        #         'timestamp': 1461244995,
+        #         'upload_date': '20160421',
+        #     },
+        #     'params': {
+        #         'force_generic_extractor': True,
+        #     },
+        # }
     ]
 
     def report_following_redirect(self, new_url):
@@ -1408,7 +1584,8 @@ class GenericIE(InfoExtractor):
         #   Site Name | Video Title
         #   Video Title - Tagline | Site Name
         # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(
+        video_title = self._og_search_title(
+            webpage, default=None) or self._html_search_regex(
             r'(?s)<title>(.*?)</title>', webpage, 'video title',
             default='video')
 
@@ -1426,6 +1603,9 @@ class GenericIE(InfoExtractor):
         video_uploader = self._search_regex(
             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
 
+        video_description = self._og_search_description(webpage, default=None)
+        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+
         # Helper method
         def _playlist_from_matches(matches, getter=None, ie=None):
             urlrs = orderedSet(
@@ -1456,6 +1636,16 @@ class GenericIE(InfoExtractor):
         if bc_urls:
             return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
 
+        # Look for ThePlatform embeds
+        tp_urls = ThePlatformIE._extract_urls(webpage)
+        if tp_urls:
+            return _playlist_from_matches(tp_urls, ie='ThePlatform')
+
+        # Look for Vessel embeds
+        vessel_urls = VesselIE._extract_urls(webpage)
+        if vessel_urls:
+            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+
         # Look for embedded rtl.nl player
         matches = re.findall(
             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
@@ -1496,12 +1686,16 @@ class GenericIE(InfoExtractor):
         if matches:
             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
 
-        # Look for embedded Dailymotion player
-        matches = re.findall(
-            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        # Look for Wordpress "YouTube Video Importer" plugin
+        matches = re.findall(r'''(?x)<div[^>]+
+            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
         if matches:
-            return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]))
+            return _playlist_from_matches(matches, lambda m: m[-1])
+
+        matches = DailymotionIE._extract_urls(webpage)
+        if matches:
+            return _playlist_from_matches(matches)
 
         # Look for embedded Dailymotion playlist player (#3822)
         m = re.search(
@@ -1524,21 +1718,26 @@ class GenericIE(InfoExtractor):
                 'url': embed_url,
                 'ie_key': 'Wistia',
                 'uploader': video_uploader,
-                'title': video_title,
-                'id': video_id,
             }
 
         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
         if match:
             return {
                 '_type': 'url_transparent',
-                'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+                'url': 'wistia:%s' % match.group('id'),
                 'ie_key': 'Wistia',
                 'uploader': video_uploader,
-                'title': video_title,
-                'id': match.group('id')
             }
 
+        match = re.search(
+            r'''(?sx)
+                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+            ''', webpage)
+        if match:
+            return self.url_result(self._proto_relative_url(
+                'wistia:%s' % match.group('id')), 'Wistia')
+
         # Look for SVT player
         svt_url = SVTIE._extract_url(webpage)
         if svt_url:
@@ -1633,10 +1832,9 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
 
         # Look for embedded Facebook player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Facebook')
+        facebook_url = FacebookIE._extract_url(webpage)
+        if facebook_url is not None:
+            return self.url_result(facebook_url, 'Facebook')
 
         # Look for embedded VK player
         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -1758,14 +1956,6 @@ class GenericIE(InfoExtractor):
             url = unescapeHTML(mobj.group('url'))
             return self.url_result(url)
 
-        # Look for embedded vulture.com player
-        mobj = re.search(
-            r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
-            webpage)
-        if mobj is not None:
-            url = unescapeHTML(mobj.group('url'))
-            return self.url_result(url, ie='Vulture')
-
         # Look for embedded mtvservices player
         mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
         if mtvservices_url:
@@ -1814,7 +2004,7 @@ class GenericIE(InfoExtractor):
             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
 
         mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+            r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
             webpage)
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Livestream')
@@ -1826,18 +2016,14 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'), 'Zapiks')
 
         # Look for Kaltura embeds
-        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
-                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
-        if mobj is not None:
-            return self.url_result(smuggle_url(
-                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
-                {'source_url': url}), 'Kaltura')
+        kaltura_url = KalturaIE._extract_url(webpage)
+        if kaltura_url:
+            return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
 
         # Look for Eagle.Platform embeds
-        mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'EaglePlatform')
+        eagleplatform_url = EaglePlatformIE._extract_url(webpage)
+        if eagleplatform_url:
+            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
 
         # Look for ClipYou (uses Eagle.Platform) embeds
         mobj = re.search(
@@ -1878,6 +2064,12 @@ class GenericIE(InfoExtractor):
         if nbc_sports_url:
             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
 
+        # Look for NBC News embeds
+        nbc_news_embed_url = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage)
+        if nbc_news_embed_url:
+            return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews')
+
         # Look for Google Drive embeds
         google_drive_url = GoogleDriveIE._extract_url(webpage)
         if google_drive_url:
@@ -1905,10 +2097,10 @@ class GenericIE(InfoExtractor):
         if onionstudios_url:
             return self.url_result(onionstudios_url)
 
-        # Look for SnagFilms embeds
-        snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
-        if snagfilms_url:
-            return self.url_result(snagfilms_url)
+        # Look for ViewLift embeds
+        viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
+        if viewlift_url:
+            return self.url_result(viewlift_url)
 
         # Look for JWPlatform embeds
         jwplatform_url = JWPlatformIE._extract_url(webpage)
@@ -1964,6 +2156,37 @@ class GenericIE(InfoExtractor):
         if liveleak_url:
             return self.url_result(liveleak_url, 'LiveLeak')
 
+        # Look for 3Q SDN embeds
+        threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
+        if threeqsdn_url:
+            return {
+                '_type': 'url_transparent',
+                'ie_key': ThreeQSDNIE.ie_key(),
+                'url': self._proto_relative_url(threeqsdn_url),
+                'title': video_title,
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'uploader': video_uploader,
+            }
+
+        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser
+        embed_url = self._html_search_meta('twitter:player', webpage, default=None)
+        if embed_url:
+            return self.url_result(embed_url)
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default=None, expected_type='VideoObject')
+        if json_ld and json_ld.get('url'):
+            info_dict.update({
+                'title': video_title or info_dict['title'],
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'age_limit': age_limit
+            })
+            info_dict.update(json_ld)
+            return info_dict
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
@@ -2044,7 +2267,8 @@ class GenericIE(InfoExtractor):
             raise UnsupportedError(url)
 
         entries = []
-        for video_url in found:
+        for video_url in orderedSet(found):
+            video_url = unescapeHTML(video_url)
             video_url = video_url.replace('\\/', '/')
             video_url = compat_urlparse.urljoin(url, video_url)
             video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
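
The final hunk above is what the new "Duplicated embedded video URLs" (hudl) test exercises: found URLs are now HTML-unescaped and de-duplicated in first-seen order before entries are built. A minimal no-network sketch with invented data; orderedSet() and unescapeHTML() are the helpers from youtube_dl/utils.py:

from youtube_dl.utils import orderedSet, unescapeHTML

# Invented data: the same embed URL scraped twice from one page.
found = [
    'http://example.com/v.mp4?a=1&amp;b=2',
    'http://example.com/v.mp4?a=1&amp;b=2',
]
entries = []
for video_url in orderedSet(found):          # stable de-dup, keeps first occurrence
    entries.append(unescapeHTML(video_url))  # '&amp;' -> '&'
assert entries == ['http://example.com/v.mp4?a=1&b=2']
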
diff --git a/youtube_dl/extractor/godtv.py b/youtube_dl/extractor/godtv.py
new file mode 100644 (file)
index 0000000..c5d3b4e
--- /dev/null
+++ b/youtube_dl/extractor/godtv.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import js_to_json
+
+
+class GodTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?god\.tv(?:/[^/]+)*/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://god.tv/jesus-image/video/jesus-conference-2016/randy-needham',
+        'info_dict': {
+            'id': 'lpd3g2MzE6D1g8zFAKz8AGpxWcpu6o_3',
+            'ext': 'mp4',
+            'title': 'Randy Needham',
+            'duration': 3615.08,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://god.tv/playlist/bible-study',
+        'info_dict': {
+            'id': 'bible-study',
+        },
+        'playlist_mincount': 37,
+    }, {
+        'url': 'http://god.tv/node/15097',
+        'only_matching': True,
+    }, {
+        'url': 'http://god.tv/live/africa',
+        'only_matching': True,
+    }, {
+        'url': 'http://god.tv/liveevents',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        settings = self._parse_json(
+            self._search_regex(
+                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+                webpage, 'settings', default='{}'),
+            display_id, transform_source=js_to_json, fatal=False)
+
+        ooyala_id = None
+
+        if settings:
+            playlist = settings.get('playlist')
+            if playlist and isinstance(playlist, list):
+                entries = [
+                    OoyalaIE._build_url_result(video['content_id'])
+                    for video in playlist if video.get('content_id')]
+                if entries:
+                    return self.playlist_result(entries, display_id)
+            ooyala_id = settings.get('ooyala', {}).get('content_id')
+
+        if not ooyala_id:
+            ooyala_id = self._search_regex(
+                r'["\']content_id["\']\s*:\s*(["\'])(?P<id>[\w-]+)\1',
+                webpage, 'ooyala id', group='id')
+
+        return OoyalaIE._build_url_result(ooyala_id)
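
For reference, the settings lookup GodTVIE performs, self-contained; the page fragment is invented, while js_to_json is the youtube_dl/utils.py helper imported above:

import json
import re

from youtube_dl.utils import js_to_json

webpage = "jQuery.extend(Drupal.settings, {'ooyala': {'content_id': 'abc123'}});"

settings = json.loads(js_to_json(re.search(
    r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
    webpage).group(1)))
assert settings['ooyala']['content_id'] == 'abc123'
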
diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py
index f6b69662baf547aa48a9bdf460671f072bd59884..a6da909310a5591fe39a68244142a46fb24ce65d 100644 (file)
@@ -4,7 +4,7 @@ from .common import InfoExtractor
 
 
 class GrouponIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)'
 
     _TEST = {
         'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
@@ -14,17 +14,27 @@ class GrouponIE(InfoExtractor):
             'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
         },
         'playlist': [{
+            'md5': '42428ce8a00585f9bc36e49226eae7a1',
             'info_dict': {
-                'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
-                'ext': 'flv',
-                'title': 'Bikram Yoga Huntington Beach | Orange County',
+                'id': 'fk6OhWpXgIQ',
+                'ext': 'mp4',
+                'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf',
                 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
-                'duration': 44.961,
+                'duration': 45,
+                'upload_date': '20160405',
+                'uploader_id': 'groupon',
+                'uploader': 'Groupon',
             },
+            'add_ie': ['Youtube'],
         }],
         'params': {
-            'skip_download': 'HDS',
-        }
+            'skip_download': True,
+        },
+    }
+
+    _PROVIDERS = {
+        'ooyala': ('ooyala:%s', 'Ooyala'),
+        'youtube': ('%s', 'Youtube'),
     }
 
     def _real_extract(self, url):
@@ -36,12 +46,17 @@ class GrouponIE(InfoExtractor):
         videos = payload['carousel'].get('dealVideos', [])
         entries = []
         for v in videos:
-            if v.get('provider') != 'OOYALA':
+            provider = v.get('provider')
+            video_id = v.get('media') or v.get('id') or v.get('baseURL')
+            if not provider or not video_id:
+                continue
+            url_pattern, ie_key = self._PROVIDERS.get(provider.lower(), (None, None))
+            if not url_pattern:
                 self.report_warning(
                     '%s: Unsupported video provider %s, skipping video' %
-                    (playlist_id, v.get('provider')))
+                    (playlist_id, provider))
                 continue
-            entries.append(self.url_result('ooyala:%s' % v['media']))
+            entries.append(self.url_result(url_pattern % video_id, ie_key))
 
         return {
             '_type': 'playlist',
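
In isolation, the provider dispatch this hunk introduces (IDs invented; the default tuple keeps unknown providers on the warning path):

_PROVIDERS = {
    'ooyala': ('ooyala:%s', 'Ooyala'),
    'youtube': ('%s', 'Youtube'),
}

def resolve(provider, video_id):
    # Unknown providers yield (None, None) and are skipped with a warning.
    url_pattern, ie_key = _PROVIDERS.get(provider.lower(), (None, None))
    return (url_pattern % video_id, ie_key) if url_pattern else (None, None)

assert resolve('YOUTUBE', 'fk6OhWpXgIQ') == ('fk6OhWpXgIQ', 'Youtube')
assert resolve('OOYALA', 'tubGNycTo_9U') == ('ooyala:tubGNycTo_9U', 'Ooyala')
assert resolve('HDS', 'x') == (None, None)
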
diff --git a/youtube_dl/extractor/hearthis.py b/youtube_dl/extractor/hearthis.py
index 7d8698655666f8de4e8850ac2684a16dd28810af..2564538820e7d534adc24fd8c967ee44490e0dc3 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
     HEADRequest,
+    KNOWN_EXTENSIONS,
     sanitized_Request,
     str_to_int,
     urlencode_postdata,
@@ -17,7 +18,7 @@ from ..utils import (
 class HearThisAtIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
     _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://hearthis.at/moofi/dr-kreep',
         'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
         'info_dict': {
@@ -26,7 +27,7 @@ class HearThisAtIE(InfoExtractor):
             'title': 'Moofi - Dr. Kreep',
             'thumbnail': 're:^https?://.*\.jpg$',
             'timestamp': 1421564134,
-            'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+            'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',
             'upload_date': '20150118',
             'comment_count': int,
             'view_count': int,
@@ -34,7 +35,25 @@ class HearThisAtIE(InfoExtractor):
             'duration': 71,
             'categories': ['Experimental'],
         }
-    }
+    }, {
+        # 'download' link redirects to the original webpage
+        'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/',
+        'md5': '5980ceb7c461605d30f1f039df160c6e',
+        'info_dict': {
+            'id': '811296',
+            'ext': 'mp3',
+            'title': 'TwitchSF - DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix!',
+            'description': 'Listen to DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance',
+            'upload_date': '20160328',
+            'timestamp': 1459186146,
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'duration': 4360,
+            'categories': ['Dance'],
+        },
+    }]
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
@@ -90,13 +109,14 @@ class HearThisAtIE(InfoExtractor):
             ext_handle = self._request_webpage(
                 ext_req, display_id, note='Determining extension')
             ext = urlhandle_detect_ext(ext_handle)
-            formats.append({
-                'format_id': 'download',
-                'vcodec': 'none',
-                'ext': ext,
-                'url': download_url,
-                'preference': 2,  # Usually better quality
-            })
+            if ext in KNOWN_EXTENSIONS:
+                formats.append({
+                    'format_id': 'download',
+                    'vcodec': 'none',
+                    'ext': ext,
+                    'url': download_url,
+                    'preference': 2,  # Usually better quality
+                })
         self._sort_formats(formats)
 
         return {
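
Why the new KNOWN_EXTENSIONS guard matters: when the 'download' link redirects back to the HTML page (the second test above), urlhandle_detect_ext() reports a non-media extension and the bogus format is dropped. A sketch, assuming the sniffed extension is 'html' in that case:

from youtube_dl.utils import KNOWN_EXTENSIONS

def keep_download_format(detected_ext):
    # Mirrors the guard: only recognised media extensions become formats.
    return detected_ext in KNOWN_EXTENSIONS

assert keep_download_format('mp3')       # genuine direct download
assert not keep_download_format('html')  # redirect back to the webpage
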
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index e8f51e545bfd2b89a251e1a4fbbeefe80aa371f9..7e36b85ad586984dfb761e4518b23d2b4a074bf7 100644 (file)
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
-        'md5': '8b743df908c42f60cf6496586c7f12c3',
+        'md5': '7d45932269a288149483144f01b99789',
         'info_dict': {
             'id': '390161',
             'ext': 'mp4',
@@ -19,9 +19,9 @@ class HowcastIE(InfoExtractor):
             'duration': 56.823,
         },
         'params': {
-            # m3u8 download
             'skip_download': True,
         },
+        'add_ie': ['Ooyala'],
     }
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py
new file mode 100644 (file)
index 0000000..656ce6d
--- /dev/null
+++ b/youtube_dl/extractor/hrti.py
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    sanitized_Request,
+    try_get,
+)
+
+
+class HRTiBaseIE(InfoExtractor):
+    """
+        Base information extractor for the Croatian Radiotelevision
+        video-on-demand site https://hrti.hrt.hr, reverse engineered
+        from the JavaScript app in app.min.js.
+    """
+    _NETRC_MACHINE = 'hrti'
+
+    _APP_LANGUAGE = 'hr'
+    _APP_VERSION = '1.1'
+    _APP_PUBLICATION_ID = 'all_in_one'
+    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+
+    def _initialize_api(self):
+        init_data = {
+            'application_publication_id': self._APP_PUBLICATION_ID
+        }
+
+        uuid = self._download_json(
+            self._API_URL, None, note='Downloading uuid',
+            errnote='Unable to download uuid',
+            data=json.dumps(init_data).encode('utf-8'))['uuid']
+
+        app_data = {
+            'uuid': uuid,
+            'application_publication_id': self._APP_PUBLICATION_ID,
+            'application_version': self._APP_VERSION
+        }
+
+        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+        req.get_method = lambda: 'PUT'
+
+        resources = self._download_json(
+            req, None, note='Downloading session information',
+            errnote='Unable to download session information')
+
+        self._session_id = resources['session_id']
+
+        modules = resources['modules']
+
+        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
+            language=self._APP_LANGUAGE,
+            application_id=self._APP_PUBLICATION_ID)
+
+        self._login_url = (modules['user']['resources']['login']['uri'] +
+                           '/format/json').format(session_id=self._session_id)
+
+        self._logout_url = modules['user']['resources']['logout']['uri']
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        # TODO: figure out authentication with cookies
+        if username is None or password is None:
+            self.raise_login_required()
+
+        auth_data = {
+            'username': username,
+            'password': password,
+        }
+
+        try:
+            auth_info = self._download_json(
+                self._login_url, None, note='Logging in', errnote='Unable to log in',
+                data=json.dumps(auth_data).encode('utf-8'))
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
+                auth_info = self._parse_json(e.cause.read().decode('utf-8'), None)
+            else:
+                raise
+
+        error_message = auth_info.get('error', {}).get('message')
+        if error_message:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
+        self._token = auth_info['secure_streaming_token']
+
+    def _real_initialize(self):
+        self._initialize_api()
+        self._login()
+
+
+class HRTiIE(HRTiBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            hrti:(?P<short_id>[0-9]+)|
+                            https?://
+                                hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
+        'info_dict': {
+            'id': '2181385',
+            'display_id': 'republika-dokumentarna-serija-16-hd',
+            'ext': 'mp4',
+            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
+            'description': 'md5:48af85f620e8e0e1df4096270568544f',
+            'duration': 2922,
+            'view_count': int,
+            'average_rating': int,
+            'episode_number': int,
+            'season_number': int,
+            'age_limit': 12,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
+        'only_matching': True,
+    }, {
+        'url': 'hrti:2181385',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('short_id') or mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        video = self._download_json(
+            '%s/video_id/%s/format/json' % (self._search_url, video_id),
+            display_id, 'Downloading video metadata JSON')['video'][0]
+
+        title_info = video['title']
+        title = title_info['title_long']
+
+        movie = video['video_assets']['movie'][0]
+        m3u8_url = movie['url'].format(TOKEN=self._token)
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = clean_html(title_info.get('summary_long'))
+        age_limit = parse_age_limit(video.get('parental_control', {}).get('rating'))
+        view_count = int_or_none(video.get('views'))
+        average_rating = int_or_none(video.get('user_rating'))
+        duration = int_or_none(movie.get('duration'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class HRTiPlaylistIE(HRTiBaseIE):
+    _VALID_URL = r'https?://hrti\.hrt\.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
+        'info_dict': {
+            'id': '212',
+            'title': 'ekumena',
+        },
+        'playlist_mincount': 8,
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category_id = mobj.group('id')
+        display_id = mobj.group('display_id') or category_id
+
+        response = self._download_json(
+            '%s/category_id/%s/format/json' % (self._search_url, category_id),
+            display_id, 'Downloading video metadata JSON')
+
+        video_ids = try_get(
+            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
+            list) or [video['id'] for video in response.get('videos', []) if video.get('id')]
+
+        entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids]
+
+        return self.playlist_result(entries, category_id, display_id)
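
Condensed, the bootstrap HRTiBaseIE performs before any extraction; the JSON field names are the ones used above, while the URL below is a placeholder, not a real HRTi asset:

# No-network sketch of the three-step handshake:
#   1. POST {'application_publication_id': 'all_in_one'}     -> uuid
#   2. PUT  uuid + application_version '1.1'                 -> session_id, module URIs
#   3. POST {'username': ..., 'password': ...} to login URI  -> secure_streaming_token
# The token then fills the {TOKEN} placeholder of each movie URL:
movie_url_template = 'https://example.invalid/master.m3u8?token={TOKEN}'
m3u8_url = movie_url_template.format(TOKEN='secure-streaming-token')
assert '{TOKEN}' not in m3u8_url
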
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 8bed8ccd06e2eeb64eba69f3407c9271c0643731..0acce9f4c2525a62e3d3ad22b16743737dbb5b07 100644 (file)
@@ -1,10 +1,10 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
+    mimetype2ext,
     qualities,
 )
 
@@ -12,9 +12,9 @@ from ..utils import (
 class ImdbIE(InfoExtractor):
     IE_NAME = 'imdb'
     IE_DESC = 'Internet Movie Database trailers'
-    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-)vi(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.imdb.com/video/imdb/vi2524815897',
         'info_dict': {
             'id': '2524815897',
@@ -22,7 +22,16 @@ class ImdbIE(InfoExtractor):
             'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
             'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
         }
-    }
+    }, {
+        'url': 'http://www.imdb.com/video/_/vi2524815897',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -48,13 +57,27 @@ class ImdbIE(InfoExtractor):
             json_data = self._search_regex(
                 r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
                 format_page, 'json data', flags=re.DOTALL)
-            info = json.loads(json_data)
-            format_info = info['videoPlayerObject']['video']
-            f_id = format_info['ffname']
+            info = self._parse_json(json_data, video_id, fatal=False)
+            if not info:
+                continue
+            format_info = info.get('videoPlayerObject', {}).get('video', {})
+            if not format_info:
+                continue
+            video_info_list = format_info.get('videoInfoList')
+            if not video_info_list or not isinstance(video_info_list, list):
+                continue
+            video_info = video_info_list[0]
+            if not video_info or not isinstance(video_info, dict):
+                continue
+            video_url = video_info.get('videoUrl')
+            if not video_url:
+                continue
+            format_id = format_info.get('ffname')
             formats.append({
-                'format_id': f_id,
-                'url': format_info['videoInfoList'][0]['videoUrl'],
-                'quality': quality(f_id),
+                'format_id': format_id,
+                'url': video_url,
+                'ext': mimetype2ext(video_info.get('videoMimeType')),
+                'quality': quality(format_id),
             })
         self._sort_formats(formats)
 
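
The imdb.py rewrite trades hard KeyError crashes for skip-and-continue parsing; reduced to one format with an invented payload (mimetype2ext is the helper imported above):

from youtube_dl.utils import mimetype2ext

info = {'videoPlayerObject': {'video': {
    'ffname': 'SD',
    'videoInfoList': [{'videoUrl': 'https://example.invalid/v.mp4',
                       'videoMimeType': 'video/mp4'}],
}}}

format_info = info.get('videoPlayerObject', {}).get('video', {})
video_info = (format_info.get('videoInfoList') or [{}])[0]
fmt = {
    'format_id': format_info.get('ffname'),
    'url': video_info.get('videoUrl'),
    'ext': mimetype2ext(video_info.get('videoMimeType')),  # 'video/mp4' -> 'mp4'
}
assert fmt == {'format_id': 'SD', 'url': 'https://example.invalid/v.mp4', 'ext': 'mp4'}
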
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 9622f198aa6aaf99094a9b85c5a914d4f0c07d46..c6f080484a99f43614f104ead8023e8e57609cda 100644 (file)
@@ -60,7 +60,8 @@ class IndavideoEmbedIE(InfoExtractor):
 
         formats = [{
             'url': video_url,
-            'height': self._search_regex(r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None),
+            'height': int_or_none(self._search_regex(
+                r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)),
         } for video_url in video_urls]
         self._sort_formats(formats)
 
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 3cbe77ad80f2fc9a03c738745524d5dac98c9d37..fc0197ae19d2c3db4a504268256400d6ef81702d 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     int_or_none,
     limit_length,
     lowercase_escape,
+    try_get,
 )
 
 
@@ -19,10 +20,16 @@ class InstagramIE(InfoExtractor):
         'info_dict': {
             'id': 'aye83DjauH',
             'ext': 'mp4',
-            'uploader_id': 'naomipq',
             'title': 'Video by naomipq',
             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
-        }
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1371748545,
+            'upload_date': '20130620',
+            'uploader_id': 'naomipq',
+            'uploader': 'Naomi Leonor Phan-Quang',
+            'like_count': int,
+            'comment_count': int,
+        },
     }, {
         # missing description
         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
@@ -31,6 +38,13 @@ class InstagramIE(InfoExtractor):
             'ext': 'mp4',
             'uploader_id': 'britneyspears',
             'title': 'Video by britneyspears',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'timestamp': 1453760977,
+            'upload_date': '20160125',
+            'uploader_id': 'britneyspears',
+            'uploader': 'Britney Spears',
+            'like_count': int,
+            'comment_count': int,
         },
         'params': {
             'skip_download': True,
@@ -67,21 +81,57 @@ class InstagramIE(InfoExtractor):
         url = mobj.group('url')
 
         webpage = self._download_webpage(url, video_id)
-        uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
-                                         webpage, 'uploader id', fatal=False)
-        desc = self._search_regex(
-            r'"caption":"(.+?)"', webpage, 'description', default=None)
-        if desc is not None:
-            desc = lowercase_escape(desc)
+
+        (video_url, description, thumbnail, timestamp, uploader,
+         uploader_id, like_count, comment_count) = [None] * 8
+
+        shared_data = self._parse_json(
+            self._search_regex(
+                r'window\._sharedData\s*=\s*({.+?});',
+                webpage, 'shared data', default='{}'),
+            video_id, fatal=False)
+        if shared_data:
+            media = try_get(
+                shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
+            if media:
+                video_url = media.get('video_url')
+                description = media.get('caption')
+                thumbnail = media.get('display_src')
+                timestamp = int_or_none(media.get('date'))
+                uploader = media.get('owner', {}).get('full_name')
+                uploader_id = media.get('owner', {}).get('username')
+                like_count = int_or_none(media.get('likes', {}).get('count'))
+                comment_count = int_or_none(media.get('comments', {}).get('count'))
+
+        if not video_url:
+            video_url = self._og_search_video_url(webpage, secure=False)
+
+        if not uploader_id:
+            uploader_id = self._search_regex(
+                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
+                webpage, 'uploader id', fatal=False)
+
+        if not description:
+            description = self._search_regex(
+                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
+            if description is not None:
+                description = lowercase_escape(description)
+
+        if not thumbnail:
+            thumbnail = self._og_search_thumbnail(webpage)
 
         return {
             'id': video_id,
-            'url': self._og_search_video_url(webpage, secure=False),
+            'url': video_url,
             'ext': 'mp4',
             'title': 'Video by %s' % uploader_id,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
             'uploader_id': uploader_id,
-            'description': desc,
+            'uploader': uploader,
+            'like_count': like_count,
+            'comment_count': comment_count,
         }
 
 
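
Instagram metadata now comes from the window._sharedData blob; a sketch of the try_get walk with a made-up blob (field names as read above):

from youtube_dl.utils import try_get

shared_data = {'entry_data': {'PostPage': [{'media': {
    'video_url': 'https://example.invalid/v.mp4',
    'date': 1371748545,
    'owner': {'username': 'naomipq', 'full_name': 'Naomi Leonor Phan-Quang'},
    'likes': {'count': 3}, 'comments': {'count': 2},
}}]}}

media = try_get(
    shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
assert media['owner']['username'] == 'naomipq'
# Missing keys return None instead of raising:
assert try_get({}, lambda x: x['entry_data']['PostPage'][0]['media'], dict) is None
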
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index ffb8008ce29c81363c58e8b7af135b4d096835e8..01c7b30428f8a750c9932b0ea734f795c09866d6 100644 (file)
@@ -3,28 +3,22 @@ from __future__ import unicode_literals
 
 import hashlib
 import itertools
-import math
-import os
-import random
 import re
 import time
-import uuid
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_parse_qs,
     compat_str,
     compat_urllib_parse_urlencode,
-    compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    clean_html,
     decode_packed_codes,
+    get_element_by_id,
+    get_element_by_attribute,
     ExtractorError,
     ohdave_rsa_encrypt,
     remove_start,
-    sanitized_Request,
-    urlencode_postdata,
-    url_basename,
 )
 
 
@@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
-        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        # MD5 checksum differs between my machine and Travis CI
         'info_dict': {
             'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'ext': 'mp4',
             'title': '美国德州空中惊现奇异云团 酷似UFO',
-            'ext': 'f4v',
         }
     }, {
         'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'md5': '667171934041350c5de3f5015f7f1152',
         'info_dict': {
             'id': 'e3f585b550a280af23c98b6cb2be19fb',
-            'title': '名侦探柯南第752集',
-        },
-        'playlist': [{
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }],
-        'params': {
-            'skip_download': True,
+            'ext': 'mp4',
+            'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',
         },
+        'skip': 'Geo-restricted to China',
     }, {
         'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
         'only_matching': True,
@@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor):
         'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
         'info_dict': {
             'id': 'f3cf468b39dddb30d676f89a91200dc1',
+            'ext': 'mp4',
             'title': '泰坦尼克号',
         },
-        'playlist': [{
-            'info_dict': {
-                'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
-                'ext': 'f4v',
-                'title': '泰坦尼克号',
-            },
-        }, {
-            'info_dict': {
-                'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
-                'ext': 'f4v',
-                'title': '泰坦尼克号',
-            },
-        }],
-        'expected_warnings': ['Needs a VIP account for full video'],
+        'skip': 'Geo-restricted to China',
     }, {
         'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
         'info_dict': {
@@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    _FORMATS_MAP = [
-        ('1', 'h6'),
-        ('2', 'h5'),
-        ('3', 'h4'),
-        ('4', 'h3'),
-        ('5', 'h2'),
-        ('10', 'h1'),
-    ]
-
-    AUTH_API_ERRORS = {
-        # No preview available (不允许试看鉴权失败)
-        'Q00505': 'This video requires a VIP account',
-        # End of preview time (试看结束鉴权失败)
-        'Q00506': 'Needs a VIP account for full video',
+    _FORMATS_MAP = {
+        '96': 1,    # 216p, 240p
+        '1': 2,     # 336p, 360p
+        '2': 3,     # 480p, 504p
+        '21': 4,    # 504p
+        '4': 5,     # 720p
+        '17': 5,    # 720p
+        '5': 6,     # 1072p, 1080p
+        '18': 7,    # 1080p
     }
 
     def _real_initialize(self):
@@ -352,174 +280,23 @@ class IqiyiIE(InfoExtractor):
 
         return True
 
-    def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
-        auth_params = {
-            # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
-            'version': '2.0',
-            'platform': 'b6c13e26323c537d',
-            'aid': tvid,
-            'tvid': tvid,
-            'uid': '',
-            'deviceId': _uuid,
-            'playType': 'main',  # XXX: always main?
-            'filename': os.path.splitext(url_basename(api_video_url))[0],
-        }
+    def get_raw_data(self, tvid, video_id):
+        tm = int(time.time() * 1000)
 
-        qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
-        for key, val in qd_items.items():
-            auth_params[key] = val[0]
-
-        auth_req = sanitized_Request(
-            'http://api.vip.iqiyi.com/services/ckn.action',
-            urlencode_postdata(auth_params))
-        # iQiyi server throws HTTP 405 error without the following header
-        auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        auth_result = self._download_json(
-            auth_req, video_id,
-            note='Downloading video authentication JSON',
-            errnote='Unable to download video authentication JSON')
-
-        code = auth_result.get('code')
-        msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
-        if code == 'Q00506':
-            if do_report_warning:
-                self.report_warning(msg)
-            return False
-        if 'data' not in auth_result:
-            if msg is not None:
-                raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
-            raise ExtractorError('Unexpected error from Iqiyi auth API')
-
-        return auth_result['data']
-
-    def construct_video_urls(self, data, video_id, _uuid, tvid):
-        def do_xor(x, y):
-            a = y % 3
-            if a == 1:
-                return x ^ 121
-            if a == 2:
-                return x ^ 72
-            return x ^ 103
-
-        def get_encode_code(l):
-            a = 0
-            b = l.split('-')
-            c = len(b)
-            s = ''
-            for i in range(c - 1, -1, -1):
-                a = do_xor(int(b[c - i - 1], 16), i)
-                s += chr(a)
-            return s[::-1]
-
-        def get_path_key(x, format_id, segment_index):
-            mg = ')(*&^flash@#$%a'
-            tm = self._download_json(
-                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
-                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
-            )['t']
-            t = str(int(math.floor(int(tm) / (600.0))))
-            return md5_text(t + mg + x)
-
-        video_urls_dict = {}
-        need_vip_warning_report = True
-        for format_item in data['vp']['tkl'][0]['vs']:
-            if 0 < int(format_item['bid']) <= 10:
-                format_id = self.get_format(format_item['bid'])
-            else:
-                continue
-
-            video_urls = []
-
-            video_urls_info = format_item['fs']
-            if not format_item['fs'][0]['l'].startswith('/'):
-                t = get_encode_code(format_item['fs'][0]['l'])
-                if t.endswith('mp4'):
-                    video_urls_info = format_item['flvs']
-
-            for segment_index, segment in enumerate(video_urls_info):
-                vl = segment['l']
-                if not vl.startswith('/'):
-                    vl = get_encode_code(vl)
-                is_vip_video = '/vip/' in vl
-                filesize = segment['b']
-                base_url = data['vp']['du'].split('/')
-                if not is_vip_video:
-                    key = get_path_key(
-                        vl.split('/')[-1].split('.')[0], format_id, segment_index)
-                    base_url.insert(-1, key)
-                base_url = '/'.join(base_url)
-                param = {
-                    'su': _uuid,
-                    'qyid': uuid.uuid4().hex,
-                    'client': '',
-                    'z': '',
-                    'bt': '',
-                    'ct': '',
-                    'tn': str(int(time.time()))
-                }
-                api_video_url = base_url + vl
-                if is_vip_video:
-                    api_video_url = api_video_url.replace('.f4v', '.hml')
-                    auth_result = self._authenticate_vip_video(
-                        api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
-                    if auth_result is False:
-                        need_vip_warning_report = False
-                        break
-                    param.update({
-                        't': auth_result['t'],
-                        # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
-                        'cid': 'afbe8fd3d73448c9',
-                        'vid': video_id,
-                        'QY00001': auth_result['u'],
-                    })
-                api_video_url += '?' if '?' not in api_video_url else '&'
-                api_video_url += compat_urllib_parse_urlencode(param)
-                js = self._download_json(
-                    api_video_url, video_id,
-                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
-                video_url = js['l']
-                video_urls.append(
-                    (video_url, filesize))
-
-            video_urls_dict[format_id] = video_urls
-        return video_urls_dict
-
-    def get_format(self, bid):
-        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
-        return matched_format_ids[0] if len(matched_format_ids) else None
-
-    def get_bid(self, format_id):
-        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
-        return matched_bids[0] if len(matched_bids) else None
-
-    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
-        tm = str(int(time.time()))
-        tail = tm + tvid
-        param = {
-            'key': 'fvip',
-            'src': md5_text('youtube-dl'),
-            'tvId': tvid,
+        key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
+        sc = md5_text(compat_str(tm) + key + tvid)
+        params = {
+            'tvid': tvid,
             'vid': video_id,
-            'vinfo': 1,
-            'tm': tm,
-            'enc': md5_text(enc_key + tail),
-            'qyid': _uuid,
-            'tn': random.random(),
-            'um': 0,
-            'authkey': md5_text(md5_text('') + tail),
-            'k_tag': 1,
+            'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
+            'sc': sc,
+            't': tm,
         }
 
-        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
-            compat_urllib_parse_urlencode(param)
-        raw_data = self._download_json(api_url, video_id)
-        return raw_data
-
-    def get_enc_key(self, video_id):
-        # TODO: automatic key extraction
-        # last update at 2016-01-22 for Zombie::bite
-        enc_key = '4a1caba4b4465345366f28da7c117d20'
-        return enc_key
+        return self._download_json(
+            'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
+            video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
+            query=params, headers=self.geo_verification_headers())
 
     def _extract_playlist(self, webpage):
         PAGE_SIZE = 50
@@ -568,58 +345,41 @@ class IqiyiIE(InfoExtractor):
             r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
         video_id = self._search_regex(
             r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        _uuid = uuid.uuid4().hex
-
-        enc_key = self.get_enc_key(video_id)
-
-        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
-
-        if raw_data['code'] != 'A000000':
-            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
-
-        data = raw_data['data']
-
-        title = data['vi']['vn']
-
-        # generate video_urls_dict
-        video_urls_dict = self.construct_video_urls(
-            data, video_id, _uuid, tvid)
-
-        # construct info
-        entries = []
-        for format_id in video_urls_dict:
-            video_urls = video_urls_dict[format_id]
-            for i, video_url_info in enumerate(video_urls):
-                if len(entries) < i + 1:
-                    entries.append({'formats': []})
-                entries[i]['formats'].append(
-                    {
-                        'url': video_url_info[0],
-                        'filesize': video_url_info[-1],
-                        'format_id': format_id,
-                        'preference': int(self.get_bid(format_id))
-                    }
-                )
-
-        for i in range(len(entries)):
-            self._sort_formats(entries[i]['formats'])
-            entries[i].update(
-                {
-                    'id': '%s_part%d' % (video_id, i + 1),
-                    'title': title,
-                }
-            )
-
-        if len(entries) > 1:
-            info = {
-                '_type': 'multi_video',
-                'id': video_id,
-                'title': title,
-                'entries': entries,
-            }
-        else:
-            info = entries[0]
-            info['id'] = video_id
-            info['title'] = title
-
-        return info
+
+        formats = []
+        for _ in range(5):
+            raw_data = self.get_raw_data(tvid, video_id)
+
+            if raw_data['code'] != 'A00000':
+                if raw_data['code'] == 'A00111':
+                    self.raise_geo_restricted()
+                raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+            data = raw_data['data']
+
+            for stream in data['vidl']:
+                if 'm3utx' not in stream:
+                    continue
+                vd = compat_str(stream['vd'])
+                formats.append({
+                    'url': stream['m3utx'],
+                    'format_id': vd,
+                    'ext': 'mp4',
+                    'preference': self._FORMATS_MAP.get(vd, -1),
+                    'protocol': 'm3u8_native',
+                })
+
+            if formats:
+                break
+
+            self._sleep(5, video_id)
+
+        self._sort_formats(formats)
+        title = (get_element_by_id('widget-videotitle', webpage) or
+                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
index 8a5e562dbc24fac4d18498e631e8f5e10d8fe038..e44e31104f55fc72daa6884f09bcf8faab390568 100644 (file)
@@ -5,33 +5,76 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
     float_or_none,
     int_or_none,
 )
 
 
 class JWPlatformBaseIE(InfoExtractor):
-    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):
+    @staticmethod
+    def _find_jwplayer_data(webpage):
+        # TODO: Merge this with JWPlayer-related code in generic.py
+
+        mobj = re.search(
+            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\((?P<options>[^)]+)\)',
+            webpage)
+        if mobj:
+            return mobj.group('options')
+
+    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+        jwplayer_data = self._parse_json(
+            self._find_jwplayer_data(webpage), video_id)
+        return self._parse_jwplayer_data(
+            jwplayer_data, video_id, *args, **kwargs)
+
+    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None):
+        # JWPlayer backward compatibility: flattened playlists
+        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+        if 'playlist' not in jwplayer_data:
+            jwplayer_data = {'playlist': [jwplayer_data]}
+
         video_data = jwplayer_data['playlist'][0]
 
+        # JWPlayer backward compatibility: flattened sources
+        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+        if 'sources' not in video_data:
+            video_data['sources'] = [video_data]
+
         formats = []
         for source in video_data['sources']:
             source_url = self._proto_relative_url(source['file'])
             source_type = source.get('type') or ''
-            if source_type in ('application/vnd.apple.mpegurl', 'hls'):
+            if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
+                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
             elif source_type.startswith('audio'):
                 formats.append({
                     'url': source_url,
                     'vcodec': 'none',
                 })
             else:
-                formats.append({
+                a_format = {
                     'url': source_url,
                     'width': int_or_none(source.get('width')),
                     'height': int_or_none(source.get('height')),
-                })
+                }
+                if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+
+                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                    # of jwplayer.flash.swf
+                    rtmp_url_parts = re.split(
+                        r'((?:mp4|mp3|flv):)', source_url, 1)
+                    if len(rtmp_url_parts) == 3:
+                        rtmp_url, prefix, play_path = rtmp_url_parts
+                        a_format.update({
+                            'url': rtmp_url,
+                            'play_path': prefix + play_path,
+                        })
+                    if rtmp_params:
+                        a_format.update(rtmp_params)
+                formats.append(a_format)
         self._sort_formats(formats)
 
         subtitles = {}
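Because the pattern in the rtmp branch above uses a capturing group, re.split keeps the delimiter, yielding exactly three parts whenever a streamable prefix is present; a quick illustration on a hypothetical URL:

import re

source_url = 'rtmp://example.com/app/mp4:videos/clip.mp4'
rtmp_url_parts = re.split(r'((?:mp4|mp3|flv):)', source_url, 1)
# -> ['rtmp://example.com/app/', 'mp4:', 'videos/clip.mp4']
if len(rtmp_url_parts) == 3:
    rtmp_url, prefix, play_path = rtmp_url_parts
    # url: 'rtmp://example.com/app/', play_path: 'mp4:videos/clip.mp4'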
index a65697ff558864f36cc5e8b8f82f959b19ea16fc..1729f5bfbfa84efebcef054423152e5dcb1ec8d2 100644 (file)
@@ -6,7 +6,6 @@ import base64
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse_urlencode,
     compat_urlparse,
     compat_parse_qs,
 )
@@ -15,6 +14,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     unsmuggle_url,
+    smuggle_url,
 )
 
 
@@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor):
                         )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
                 )
                 '''
-    _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
+    _SERVICE_URL = 'http://cdnapi.kaltura.com'
+    _SERVICE_BASE = '/api_v3/index.php'
     _TESTS = [
         {
             'url': 'kaltura:269692:1_1jc2y3e4',
@@ -64,16 +65,50 @@ class KalturaIE(InfoExtractor):
         }
     ]
 
-    def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = (
+            re.search(
+                r"""(?xs)
+                    kWidget\.(?:thumb)?[Ee]mbed\(
+                    \{.*?
+                        (?P<q1>['\"])wid(?P=q1)\s*:\s*
+                        (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?
+                        (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
+                        (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),
+                """, webpage) or
+            re.search(
+                r'''(?xs)
+                    (?P<q1>["\'])
+                        (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?
+                    (?P=q1).*?
+                    (?:
+                        entry_?[Ii]d|
+                        (?P<q2>["\'])entry_?[Ii]d(?P=q2)
+                    )\s*:\s*
+                    (?P<q3>["\'])(?P<id>.+?)(?P=q3)
+                ''', webpage))
+        if mobj:
+            embed_info = mobj.groupdict()
+            url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
+            escaped_pid = re.escape(embed_info['partner_id'])
+            service_url = re.search(
+                r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
+                webpage)
+            if service_url:
+                url = smuggle_url(url, {'service_url': service_url.group(1)})
+            return url
+
+    def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
         params = actions[0]
         if len(actions) > 1:
             for i, a in enumerate(actions[1:], start=1):
                 for k, v in a.items():
                     params['%d:%s' % (i, k)] = v
 
-        query = compat_urllib_parse_urlencode(params)
-        url = self._API_BASE + query
-        data = self._download_json(url, video_id, *args, **kwargs)
+        data = self._download_json(
+            (service_url or self._SERVICE_URL) + self._SERVICE_BASE,
+            video_id, query=params, *args, **kwargs)
 
         status = data if len(actions) == 1 else data[0]
         if status.get('objectType') == 'KalturaAPIException':
@@ -82,7 +117,7 @@ class KalturaIE(InfoExtractor):
 
         return data
 
-    def _get_kaltura_signature(self, video_id, partner_id):
+    def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
         actions = [{
             'apiVersion': '3.1',
             'expiry': 86400,
@@ -92,10 +127,10 @@ class KalturaIE(InfoExtractor):
             'widgetId': '_%s' % partner_id,
         }]
         return self._kaltura_api_call(
-            video_id, actions, note='Downloading Kaltura signature')['ks']
+            video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
 
-    def _get_video_info(self, video_id, partner_id):
-        signature = self._get_kaltura_signature(video_id, partner_id)
+    def _get_video_info(self, video_id, partner_id, service_url=None):
+        signature = self._get_kaltura_signature(video_id, partner_id, service_url)
         actions = [
             {
                 'action': 'null',
@@ -118,7 +153,7 @@ class KalturaIE(InfoExtractor):
             },
         ]
         return self._kaltura_api_call(
-            video_id, actions, note='Downloading video info JSON')
+            video_id, actions, service_url, note='Downloading video info JSON')
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -127,7 +162,7 @@ class KalturaIE(InfoExtractor):
         partner_id, entry_id = mobj.group('partner_id', 'id')
         ks = None
         if partner_id and entry_id:
-            info, flavor_assets = self._get_video_info(entry_id, partner_id)
+            info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))
         else:
             path, query = mobj.group('path', 'query')
             if not path and not query:
@@ -175,12 +210,17 @@ class KalturaIE(InfoExtractor):
                 unsigned_url += '?referrer=%s' % referrer
             return unsigned_url
 
+        data_url = info['dataUrl']
+        if '/flvclipper/' in data_url:
+            data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
+
         formats = []
         for f in flavor_assets:
             # Continue if asset is not ready
             if f['status'] != 2:
                 continue
-            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
+            video_url = sign_url(
+                '%s/flavorId/%s' % (data_url, f['id']))
             formats.append({
                 'format_id': '%(fileExt)s-%(bitrate)s' % f,
                 'ext': f.get('fileExt'),
@@ -193,9 +233,12 @@ class KalturaIE(InfoExtractor):
                 'width': int_or_none(f.get('width')),
                 'url': video_url,
             })
-        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
-        formats.extend(self._extract_m3u8_formats(
-            m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        if '/playManifest/' in data_url:
+            m3u8_url = sign_url(data_url.replace(
+                'format/url', 'format/applehttp'))
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, entry_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
 
         self._check_formats(formats, entry_id)
         self._sort_formats(formats)
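smuggle_url/unsmuggle_url from youtube_dl.utils carry the service_url detected in _extract_url over to _real_extract by stashing JSON in the URL fragment; roughly, with hypothetical IDs:

from youtube_dl.utils import smuggle_url, unsmuggle_url

url = smuggle_url('kaltura:1234567:1_abcdefgh',
                  {'service_url': 'https://cdnapisec.kaltura.com'})
# later, at the top of _real_extract:
url, smuggled_data = unsmuggle_url(url, {})
smuggled_data.get('service_url')  # -> 'https://cdnapisec.kaltura.com'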
diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py
new file mode 100644 (file)
index 0000000..b50120d
--- /dev/null
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    qualities,
+)
+
+
+class KamcordIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.kamcord.com/v/hNYRduDgWb4',
+        'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c',
+        'info_dict': {
+            'id': 'hNYRduDgWb4',
+            'ext': 'mp4',
+            'title': 'Drinking Madness',
+            'uploader': 'jacksfilms',
+            'uploader_id': '3044562',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)',
+                webpage, 'video'),
+            video_id)['video']
+
+        title = video['title']
+
+        formats = self._extract_m3u8_formats(
+            video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native')
+        self._sort_formats(formats)
+
+        uploader = video.get('user', {}).get('username')
+        uploader_id = video.get('user', {}).get('id')
+
+        view_count = int_or_none(video.get('viewCount'))
+        like_count = int_or_none(video.get('heartCount'))
+        comment_count = int_or_none(video.get('messageCount'))
+
+        preference_key = qualities(('small', 'medium', 'large'))
+
+        thumbnails = [{
+            'url': thumbnail_url,
+            'id': thumbnail_id,
+            'preference': preference_key(thumbnail_id),
+        } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items()
+            if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
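qualities() from youtube_dl.utils turns an ordered tuple of names into a preference function, which is what ranks the thumbnail sizes above:

from youtube_dl.utils import qualities

preference_key = qualities(('small', 'medium', 'large'))
preference_key('small')    # -> 0 (least preferred)
preference_key('large')    # -> 2 (most preferred)
preference_key('unknown')  # -> -1 for ids outside the tuple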
index c0ece51133a441629a0272757f1a5e8157f671f6..b1d460599e2dafb16d7f16a963de5764e0577b3b 100644 (file)
@@ -26,11 +26,6 @@ class KuwoBaseIE(InfoExtractor):
     def _get_formats(self, song_id, tolerate_ip_deny=False):
         formats = []
         for file_format in self._FORMATS:
-            headers = {}
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                headers['Ytdl-request-proxy'] = cn_verification_proxy
-
             query = {
                 'format': file_format['ext'],
                 'br': file_format.get('br', ''),
@@ -42,7 +37,7 @@ class KuwoBaseIE(InfoExtractor):
             song_url = self._download_webpage(
                 'http://antiserver.kuwo.cn/anti.s',
                 song_id, note='Download %s url info' % file_format['format'],
-                query=query, headers=headers,
+                query=query, headers=self.geo_verification_headers(),
             )
 
             if song_url == 'IPDeny' and not tolerate_ip_deny:
@@ -81,7 +76,7 @@ class KuwoIE(KuwoBaseIE):
             'id': '6446136',
             'ext': 'mp3',
             'title': '心',
-            'description': 'md5:b2ab6295d014005bfc607525bfc1e38a',
+            'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c',
             'creator': 'IU',
             'upload_date': '20150518',
         },
@@ -102,10 +97,10 @@ class KuwoIE(KuwoBaseIE):
             raise ExtractorError('this song has been offline because of copyright issues', expected=True)
 
         song_name = self._html_search_regex(
-            r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name')
-        singer_name = self._html_search_regex(
-            r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
-            webpage, 'singer name', fatal=False)
+            r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name')
+        singer_name = remove_start(self._html_search_regex(
+            r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">',
+            webpage, 'singer name', fatal=False), '歌手')
         lrc_content = clean_html(get_element_by_id('lrcContent', webpage))
         if lrc_content == '暂无':     # indicates no lyrics
             lrc_content = None
@@ -114,7 +109,7 @@ class KuwoIE(KuwoBaseIE):
         self._sort_formats(formats)
 
         album_id = self._html_search_regex(
-            r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+            r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
             webpage, 'album id', fatal=False)
 
         publish_time = None
@@ -148,8 +143,8 @@ class KuwoAlbumIE(InfoExtractor):
         'url': 'http://www.kuwo.cn/album/502294/',
         'info_dict': {
             'id': '502294',
-            'title': 'M',
-            'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
+            'title': 'Made\xa0Series\xa0《M》',
+            'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f',
         },
         'playlist_count': 2,
     }
@@ -209,7 +204,7 @@ class KuwoSingerIE(InfoExtractor):
         'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
         'info_dict': {
             'id': 'bruno+mars',
-            'title': 'Bruno Mars',
+            'title': 'Bruno\xa0Mars',
         },
         'playlist_mincount': 329,
     }, {
@@ -268,7 +263,7 @@ class KuwoCategoryIE(InfoExtractor):
             'title': '八十年代精选',
             'description': '这些都是属于八十年代的回忆!',
         },
-        'playlist_count': 24,
+        'playlist_mincount': 24,
     }
 
     def _real_extract(self, url):
@@ -283,6 +278,8 @@ class KuwoCategoryIE(InfoExtractor):
         category_desc = remove_start(
             get_element_by_id('intro', webpage).strip(),
             '%s简介:' % category_name)
+        if category_desc == '暂无':
+            category_desc = None
 
         jsonm = self._parse_json(self._html_search_regex(
             r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
@@ -304,7 +301,7 @@ class KuwoMvIE(KuwoBaseIE):
             'id': '6480076',
             'ext': 'mp4',
             'title': 'My HouseMV',
-            'creator': '2PM',
+            'creator': 'PM02:00',
         },
         # In this video, music URLs (anti.s) are blocked outside China and
+        # USA, while the MV URL (mvurl) is available globally, so force the MV
+        # URL for consistent results in different countries
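The Kuwo and Le changes both replace hand-rolled Ytdl-request-proxy handling with the shared geo_verification_headers() helper on InfoExtractor; its behavior amounts to roughly the following sketch (an approximation, not the exact implementation):

def geo_verification_headers(self):
    headers = {}
    # assumption: the proxy comes from the --geo-verification-proxy option
    geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
    if geo_verification_proxy:
        headers['Ytdl-request-proxy'] = geo_verification_proxy
    return headers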
index b08f6e3c9548de02217e43bebbf20b5f2ab871e8..da5a5de4ad7e65b995a257303096b4bc58061b67 100644 (file)
@@ -1,60 +1,65 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    parse_duration,
+    js_to_json,
+    smuggle_url,
 )
 
 
 class LA7IE(InfoExtractor):
-    IE_NAME = 'la7.tv'
-    _VALID_URL = r'''(?x)
-        https?://(?:www\.)?la7\.tv/
-        (?:
-            richplayer/\?assetid=|
-            \?contentId=
-        )
-        (?P<id>[0-9]+)'''
-
-    _TEST = {
-        'url': 'http://www.la7.tv/richplayer/?assetid=50355319',
-        'md5': 'ec7d1f0224d20ba293ab56cf2259651f',
+    IE_NAME = 'la7.it'
+    _VALID_URL = r'''(?x)(https?://)?(?:
+        (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
+        tg\.la7\.it/repliche-tgla7\?id=
+    )(?P<id>.+)'''
+
+    _TESTS = [{
+        # 'src' is a plain URL
+        'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
+        'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
         'info_dict': {
-            'id': '50355319',
+            'id': 'inccool8-02-10-2015-163722',
             'ext': 'mp4',
-            'title': 'IL DIVO',
-            'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti  e Flavio Bucci',
-            'duration': 6254,
+            'title': 'Inc.Cool8',
+            'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto  atletico',
+            'thumbnail': 're:^https?://.*',
+            'uploader_id': 'kdla7pillole@iltrovatore.it',
+            'timestamp': 1443814869,
+            'upload_date': '20151002',
         },
-        'skip': 'Blocked in the US',
-    }
+    }, {
+        # 'src' is a dictionary
+        'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
+        'md5': '6b0d8888d286e39870208dfeceaf456b',
+        'info_dict': {
+            'id': '189080',
+            'ext': 'mp4',
+            'title': 'TG LA7',
+        },
+    }, {
+        'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id
-        doc = self._download_xml(xml_url, video_id)
-
-        video_title = doc.find('title').text
-        description = doc.find('description').text
-        duration = parse_duration(doc.find('duration').text)
-        thumbnail = doc.find('img').text
-        view_count = int(doc.find('views').text)
 
-        prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:')
+        webpage = self._download_webpage(url, video_id)
 
-        formats = [{
-            'format': vnode.find('quality').text,
-            'tbr': int(vnode.find('quality').text),
-            'url': vnode.find('fms').text.strip().replace('mp4:', prefix),
-        } for vnode in doc.findall('.//videos/video')]
-        self._sort_formats(formats)
+        player_data = self._parse_json(
+            self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'),
+            video_id, transform_source=js_to_json)
 
         return {
+            '_type': 'url_transparent',
+            'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
+                'service_url': 'http://kdam.iltrovatore.it',
+            }),
             'id': video_id,
-            'title': video_title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-            'view_count': view_count,
+            'title': player_data['title'],
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': player_data.get('poster'),
+            'ie_key': 'Kaltura',
         }
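js_to_json from youtube_dl.utils normalizes the JavaScript object literal passed to videoLa7() into strict JSON before _parse_json parses it; for instance, on a hypothetical snippet:

from youtube_dl.utils import js_to_json

player_js = "{vid: '189080', title: 'TG LA7'}"
print(js_to_json(player_js))
# -> {"vid": "189080", "title": "TG LA7"}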
diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py
new file mode 100644 (file)
index 0000000..1435e09
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LearnrIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript',
+        'md5': '3719fdf0a68397f49899e82c308a89de',
+        'info_dict': {
+            'id': '51624',
+            'ext': 'mp4',
+            'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript',
+            'description': 'md5:b36dbfa92350176cdf12b4d388485503',
+            'uploader': 'LearnCode.academy',
+            'uploader_id': 'learncodeacademy',
+            'upload_date': '20131021',
+        },
+        'add_ie': ['Youtube'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        return {
+            '_type': 'url_transparent',
+            'url': self._search_regex(
+                r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'),
+            'id': video_id,
+        }
index 375fdaed129421371f8575c3aeceb71ed4712de7..e9cc9aa5983967861b08a2d9ee79297ae3a1726e 100644 (file)
@@ -20,15 +20,16 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_iso8601,
-    sanitized_Request,
     str_or_none,
     url_basename,
+    urshift,
+    update_url_query,
 )
 
 
 class LeIE(InfoExtractor):
     IE_DESC = '乐视网'
-    _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|sports\.le\.com/video)/(?P<id>\d+)\.html'
 
     _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
 
@@ -69,17 +70,16 @@ class LeIE(InfoExtractor):
             'hls_prefer_native': True,
         },
         'skip': 'Only available in China',
+    }, {
+        'url': 'http://sports.le.com/video/25737697.html',
+        'only_matching': True,
     }]
 
-    @staticmethod
-    def urshift(val, n):
-        return val >> n if val >= 0 else (val + 0x100000000) >> n
-
     # ror() and calc_time_key() are reversed from an embedded swf file in KLetvPlayer.swf
     def ror(self, param1, param2):
         _loc3_ = 0
         while _loc3_ < param2:
-            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
+            param1 = urshift(param1, 1) + ((param1 & 1) << 31)
             _loc3_ += 1
         return param1
 
@@ -90,6 +90,10 @@ class LeIE(InfoExtractor):
         _loc3_ = self.ror(_loc3_, _loc2_ % 17)
         return _loc3_
 
+    # reversed from http://jstatic.letvcdn.com/sdk/player.js
+    def get_mms_key(self, time):
+        return self.ror(time, 8) ^ 185025305
+
     # see M3U8Encryption class in KLetvPlayer.swf
     @staticmethod
     def decrypt_m3u8(encrypted_data):
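get_mms_key builds on ror(), a 32-bit rotate-right assembled from urshift() (now in youtube_dl.utils), which emulates JavaScript's unsigned >>> shift; a self-contained sketch:

def urshift(val, n):
    # JavaScript-style unsigned right shift for 32-bit values
    return val >> n if val >= 0 else (val + 0x100000000) >> n

def ror(val, n):
    # rotate a 32-bit value right by n bits, one bit at a time
    for _ in range(n):
        val = urshift(val, 1) + ((val & 1) << 31)
    return val

def get_mms_key(ts):
    return ror(ts, 8) ^ 185025305

get_mms_key(1467849600)  # hypothetical Unix timestamp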
@@ -110,28 +114,7 @@ class LeIE(InfoExtractor):
 
         return bytes(_loc7_)
 
-    def _real_extract(self, url):
-        media_id = self._match_id(url)
-        page = self._download_webpage(url, media_id)
-        params = {
-            'id': media_id,
-            'platid': 1,
-            'splatid': 101,
-            'format': 1,
-            'tkey': self.calc_time_key(int(time.time())),
-            'domain': 'www.le.com'
-        }
-        play_json_req = sanitized_Request(
-            'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params)
-        )
-        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-        if cn_verification_proxy:
-            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-        play_json = self._download_json(
-            play_json_req,
-            media_id, 'Downloading playJson data')
-
+    def _check_errors(self, play_json):
         # Check for errors
         playstatus = play_json['playstatus']
         if playstatus['status'] == 0:
@@ -142,43 +125,99 @@ class LeIE(InfoExtractor):
                 msg = 'Generic error. flag = %d' % flag
             raise ExtractorError(msg, expected=True)
 
-        playurl = play_json['playurl']
-
-        formats = ['350', '1000', '1300', '720p', '1080p']
-        dispatch = playurl['dispatch']
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
 
-        urls = []
-        for format_id in formats:
-            if format_id in dispatch:
-                media_url = playurl['domain'][0] + dispatch[format_id][0]
-                media_url += '&' + compat_urllib_parse_urlencode({
-                    'm3v': 1,
+        play_json_h5 = self._download_json(
+            'http://api.le.com/mms/out/video/playJsonH5',
+            media_id, 'Downloading html5 playJson data', query={
+                'id': media_id,
+                'platid': 3,
+                'splatid': 304,
+                'format': 1,
+                'tkey': self.get_mms_key(int(time.time())),
+                'domain': 'www.le.com',
+                'tss': 'no',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_h5)
+
+        play_json_flash = self._download_json(
+            'http://api.le.com/mms/out/video/playJson',
+            media_id, 'Downloading flash playJson data', query={
+                'id': media_id,
+                'platid': 1,
+                'splatid': 101,
+                'format': 1,
+                'tkey': self.calc_time_key(int(time.time())),
+                'domain': 'www.le.com',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_flash)
+
+        def get_h5_urls(media_url, format_id):
+            location = self._download_json(
+                media_url, media_id,
+                'Download JSON metadata for format %s' % format_id, query={
                     'format': 1,
                     'expect': 3,
-                    'rateid': format_id,
-                })
+                    'tss': 'no',
+                })['location']
 
-                nodes_data = self._download_json(
-                    media_url, media_id,
-                    'Download JSON metadata for format %s' % format_id)
+            return {
+                'http': update_url_query(location, {'tss': 'no'}),
+                'hls': update_url_query(location, {'tss': 'ios'}),
+            }
 
-                req = self._request_webpage(
-                    nodes_data['nodelist'][0]['location'], media_id,
-                    note='Downloading m3u8 information for format %s' % format_id)
+        def get_flash_urls(media_url, format_id):
+            media_url += '&' + compat_urllib_parse_urlencode({
+                'm3v': 1,
+                'format': 1,
+                'expect': 3,
+                'rateid': format_id,
+            })
 
-                m3u8_data = self.decrypt_m3u8(req.read())
+            nodes_data = self._download_json(
+                media_url, media_id,
+                'Download JSON metadata for format %s' % format_id)
 
-                url_info_dict = {
-                    'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
-                    'ext': determine_ext(dispatch[format_id][1]),
-                    'format_id': format_id,
-                    'protocol': 'm3u8',
-                }
+            req = self._request_webpage(
+                nodes_data['nodelist'][0]['location'], media_id,
+                note='Downloading m3u8 information for format %s' % format_id)
 
-                if format_id[-1:] == 'p':
-                    url_info_dict['height'] = int_or_none(format_id[:-1])
+            m3u8_data = self.decrypt_m3u8(req.read())
 
-                urls.append(url_info_dict)
+            return {
+                'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+            }
+
+        extracted_formats = []
+        formats = []
+        for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
+            playurl = play_json['playurl']
+            play_domain = playurl['domain'][0]
+
+            for format_id, format_data in playurl.get('dispatch', {}).items():
+                if format_id in extracted_formats:
+                    continue
+                extracted_formats.append(format_id)
+
+                media_url = play_domain + format_data[0]
+                for protocol, format_url in get_urls(media_url, format_id).items():
+                    f = {
+                        'url': format_url,
+                        'ext': determine_ext(format_data[1]),
+                        'format_id': '%s-%s' % (protocol, format_id),
+                        'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                        'quality': int_or_none(format_id),
+                    }
+
+                    if format_id[-1:] == 'p':
+                        f['height'] = int_or_none(format_id[:-1])
+
+                    formats.append(f)
+        self._sort_formats(formats, ('height', 'quality', 'format_id'))
 
         publish_time = parse_iso8601(self._html_search_regex(
             r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
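update_url_query from youtube_dl.utils merges extra parameters into a URL that may already carry a query string, which is how get_h5_urls derives the http and hls variants; e.g. with a hypothetical location URL:

from youtube_dl.utils import update_url_query

location = 'http://cdn.example.com/play?expect=3'
update_url_query(location, {'tss': 'ios'})
# -> 'http://cdn.example.com/play?expect=3&tss=ios' (parameter order may vary)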
@@ -187,7 +226,7 @@ class LeIE(InfoExtractor):
 
         return {
             'id': media_id,
-            'formats': urls,
+            'formats': formats,
             'title': playurl['title'],
             'thumbnail': playurl['pic'],
             'description': description,
@@ -196,7 +235,7 @@ class LeIE(InfoExtractor):
 
 
 class LePlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)'
+    _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
 
     _TESTS = [{
         'url': 'http://www.le.com/tv/46177.html',
diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py
new file mode 100644 (file)
index 0000000..0a94366
--- /dev/null
@@ -0,0 +1,144 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    parse_filesize,
+)
+
+
+class LibraryOfCongressIE(InfoExtractor):
+    IE_NAME = 'loc'
+    IE_DESC = 'Library of Congress'
+    _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        # embedded via <div class="media-player"
+        'url': 'http://loc.gov/item/90716351/',
+        'md5': '353917ff7f0255aa6d4b80a034833de8',
+        'info_dict': {
+            'id': '90716351',
+            'ext': 'mp4',
+            'title': "Pa's trip to Mars",
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 0,
+            'view_count': int,
+        },
+    }, {
+        # webcast embedded via mediaObjectId
+        'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
+        'info_dict': {
+            'id': '5578',
+            'ext': 'mp4',
+            'title': 'Help! Preservation Training Needs Here, There & Everywhere',
+            'duration': 3765,
+            'view_count': int,
+            'subtitles': 'mincount:1',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # with direct download links
+        'url': 'https://www.loc.gov/item/78710669/',
+        'info_dict': {
+            'id': '78710669',
+            'ext': 'mp4',
+            'title': 'La vie et la passion de Jesus-Christ',
+            'duration': 0,
+            'view_count': int,
+            'formats': 'mincount:4',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        media_id = self._search_regex(
+            (r'id=(["\'])media-player-(?P<id>.+?)\1',
+             r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
+             r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
+             r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'),
+            webpage, 'media id', group='id')
+
+        data = self._download_json(
+            'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
+            video_id)['mediaObject']
+
+        derivative = data['derivatives'][0]
+        media_url = derivative['derivativeUrl']
+
+        title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+            webpage)
+
+        # The following algorithm was extracted from the setAVSource js
+        # function found in the webpage
+        media_url = media_url.replace('rtmp', 'https')
+
+        is_video = data.get('mediaType', 'v').lower() == 'v'
+        ext = determine_ext(media_url)
+        if ext not in ('mp4', 'mp3'):
+            media_url += '.mp4' if is_video else '.mp3'
+
+        formats = []  # ensure formats exists even without a vod/ URL
+        if 'vod/mp4:' in media_url:
+            formats = [{
+                'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8',
+                'format_id': 'hls',
+                'ext': 'mp4',
+                'protocol': 'm3u8_native',
+                'quality': 1,
+            }]
+        elif 'vod/mp3:' in media_url:
+            formats = [{
+                'url': media_url.replace('vod/mp3:', ''),
+                'vcodec': 'none',
+            }]
+
+        download_urls = set()
+        for m in re.finditer(
+                r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
+            format_id = m.group('id').lower()
+            if format_id == 'gif':
+                continue
+            download_url = m.group('url')
+            if download_url in download_urls:
+                continue
+            download_urls.add(download_url)
+            formats.append({
+                'url': download_url,
+                'format_id': format_id,
+                'filesize_approx': parse_filesize(m.group('size')),
+            })
+
+        self._sort_formats(formats)
+
+        duration = float_or_none(data.get('duration'))
+        view_count = int_or_none(data.get('viewCount'))
+
+        subtitles = {}
+        cc_url = data.get('ccUrl')
+        if cc_url:
+            subtitles.setdefault('en', []).append({
+                'url': cc_url,
+                'ext': 'ttml',
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
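The setAVSource-style rewrite above, traced on a hypothetical derivativeUrl:

media_url = 'rtmp://stream.loc.gov/vod/mp4:public/movie_clip'
media_url = media_url.replace('rtmp', 'https')
# -> 'https://stream.loc.gov/vod/mp4:public/movie_clip'
media_url += '.mp4'  # no recognized extension and mediaType is video
hls_url = media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8'
# -> 'https://stream.loc.gov/hls-vod/media/public/movie_clip.mp4.m3u8'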
index ba2f80a757d071042b8d574721bde37a1b7006ba..c2b4490c49044bea3426dc48b873218aa98146b1 100644 (file)
@@ -7,48 +7,53 @@ from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
+    ExtractorError,
     int_or_none,
+    parse_iso8601,
     remove_end,
-    unified_strdate,
-    ExtractorError,
 )
 
 
 class LifeNewsIE(InfoExtractor):
-    IE_NAME = 'lifenews'
-    IE_DESC = 'LIFE | NEWS'
-    _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
+    IE_NAME = 'life'
+    IE_DESC = 'Life.ru'
+    _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'
 
     _TESTS = [{
         # single video embedded via video/source
-        'url': 'http://lifenews.ru/news/98736',
+        'url': 'https://life.ru/t/новости/98736',
         'md5': '77c95eaefaca216e32a76a343ad89d23',
         'info_dict': {
             'id': '98736',
             'ext': 'mp4',
             'title': 'Мужчина нашел дома архив оборонного завода',
             'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+            'timestamp': 1344154740,
             'upload_date': '20120805',
+            'view_count': int,
         }
     }, {
         # single video embedded via iframe
-        'url': 'http://lifenews.ru/news/152125',
+        'url': 'https://life.ru/t/новости/152125',
         'md5': '77d19a6f0886cd76bdbf44b4d971a273',
         'info_dict': {
             'id': '152125',
             'ext': 'mp4',
             'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
             'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+            'timestamp': 1427961840,
             'upload_date': '20150402',
+            'view_count': int,
         }
     }, {
         # two videos embedded via iframe
-        'url': 'http://lifenews.ru/news/153461',
+        'url': 'https://life.ru/t/новости/153461',
         'info_dict': {
             'id': '153461',
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
-            'upload_date': '20150505',
+            'timestamp': 1430825520,
+            'view_count': int,
         },
         'playlist': [{
             'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
@@ -57,6 +62,7 @@ class LifeNewsIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
                 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'timestamp': 1430825520,
                 'upload_date': '20150505',
             },
         }, {
@@ -66,22 +72,25 @@ class LifeNewsIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
                 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'timestamp': 1430825520,
                 'upload_date': '20150505',
             },
         }],
     }, {
-        'url': 'http://lifenews.ru/video/13035',
+        'url': 'https://life.ru/t/новости/213035',
+        'only_matching': True,
+    }, {
+        'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461',
+        'only_matching': True,
+    }, {
+        'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',
         'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        section = mobj.group('section')
+        video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://lifenews.ru/%s/%s' % (section, video_id),
-            video_id, 'Downloading page')
+        webpage = self._download_webpage(url, video_id)
 
         video_urls = re.findall(
             r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
@@ -95,26 +104,22 @@ class LifeNewsIE(InfoExtractor):
 
         title = remove_end(
             self._og_search_title(webpage),
-            ' - Первый по срочным новостям — LIFE | NEWS')
+            ' - Life.ru')
 
         description = self._og_search_description(webpage)
 
         view_count = self._html_search_regex(
-            r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
-        comment_count = self._html_search_regex(
-            r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
-            webpage, 'comment count', fatal=False)
+            r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>',
+            webpage, 'view count', fatal=False, group='value')
 
-        upload_date = self._html_search_regex(
-            r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
-        if upload_date is not None:
-            upload_date = unified_strdate(upload_date)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1',
+            webpage, 'upload date', fatal=False, group='value'))
 
         common_info = {
             'description': description,
             'view_count': int_or_none(view_count),
-            'comment_count': int_or_none(comment_count),
-            'upload_date': upload_date,
+            'timestamp': timestamp,
         }
 
         def make_entry(video_id, video_url, index=None):
@@ -183,7 +188,8 @@ class LifeEmbedIE(InfoExtractor):
             ext = determine_ext(video_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', m3u8_id='m3u8'))
+                    video_url, video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='m3u8'))
             else:
                 formats.append({
                     'url': video_url,
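The LifeNews rewrite replaces manual upload_date parsing with a timestamp from parse_iso8601 (youtube_dl.utils), which converts the scraped datetime attribute into the Unix timestamps seen in the tests; for example:

from youtube_dl.utils import parse_iso8601

parse_iso8601('2015-05-05T12:12:00+03:00')  # hypothetical value -> 1430817120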
index 2599d45c37e3c7874e12227677962fae3a2fbf84..5d2c3e256740d865e1c0be2c8e2808a9ee97ee43 100644 (file)
@@ -98,13 +98,19 @@ class LimelightBaseIE(InfoExtractor):
         } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
 
         subtitles = {}
-        for caption in properties.get('captions', {}):
+        for caption in properties.get('captions', []):
             lang = caption.get('language_code')
             subtitles_url = caption.get('url')
             if lang and subtitles_url:
-                subtitles[lang] = [{
+                subtitles.setdefault(lang, []).append({
                     'url': subtitles_url,
-                }]
+                })
+        closed_captions_url = properties.get('closed_captions_url')
+        if closed_captions_url:
+            subtitles.setdefault('en', []).append({
+                'url': closed_captions_url,
+                'ext': 'ttml',
+            })
 
         return {
             'id': video_id,
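The switch to setdefault in LimelightBaseIE matters when several caption tracks share a language code: plain assignment kept only the last track. In miniature:

subtitles = {}
for lang, url in (('en', 'a.ttml'), ('en', 'b.ttml')):
    subtitles.setdefault(lang, []).append({'url': url})
# subtitles == {'en': [{'url': 'a.ttml'}, {'url': 'b.ttml'}]}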
@@ -123,7 +129,18 @@ class LimelightBaseIE(InfoExtractor):
 
 class LimelightMediaIE(LimelightBaseIE):
     IE_NAME = 'limelight'
-    _VALID_URL = r'(?:limelight:media:|https?://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:media:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bmediaId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
     _TESTS = [{
         'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
         'info_dict': {
@@ -158,6 +175,9 @@ class LimelightMediaIE(LimelightBaseIE):
             # rtmp download
             'skip_download': True,
         },
+    }, {
+        'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
+        'only_matching': True,
     }]
     _PLAYLIST_SERVICE_PATH = 'media'
     _API_PATH = 'media'
@@ -176,15 +196,29 @@ class LimelightMediaIE(LimelightBaseIE):
 
 class LimelightChannelIE(LimelightBaseIE):
     IE_NAME = 'limelight:channel'
-    _VALID_URL = r'(?:limelight:channel:|https?://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
-    _TEST = {
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:channel:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bchannelId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
+    _TESTS = [{
         'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
         'info_dict': {
             'id': 'ab6a524c379342f9b23642917020c082',
             'title': 'Javascript Sample Code',
         },
         'playlist_mincount': 3,
-    }
+    }, {
+        'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082',
+        'only_matching': True,
+    }]
     _PLAYLIST_SERVICE_PATH = 'channel'
     _API_PATH = 'channels'
 
@@ -207,15 +241,29 @@ class LimelightChannelIE(LimelightBaseIE):
 
 class LimelightChannelListIE(LimelightBaseIE):
     IE_NAME = 'limelight:channel_list'
-    _VALID_URL = r'(?:limelight:channel_list:|https?://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
-    _TEST = {
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:channel_list:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bchannelListId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
+    _TESTS = [{
         'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
         'info_dict': {
             'id': '301b117890c4465c8179ede21fd92e2b',
             'title': 'Website - Hero Player',
         },
         'playlist_mincount': 2,
-    }
+    }, {
+        'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b',
+        'only_matching': True,
+    }]
     _PLAYLIST_SERVICE_PATH = 'channel_list'
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py
new file mode 100644 (file)
index 0000000..3356d01
--- /dev/null
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    smuggle_url,
+    unsmuggle_url,
+)
+
+
+class LiTVIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
+
+    _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
+
+    _TESTS = [{
+        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+        'info_dict': {
+            'id': 'VOD00041606',
+            'title': '花千骨',
+        },
+        'playlist_count': 50,
+    }, {
+        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+        'info_dict': {
+            'id': 'VOD00041610',
+            'ext': 'mp4',
+            'title': '花千骨第1集',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
+            'episode_number': 1,
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,  # m3u8 download
+        },
+        'skip': 'Georestricted to Taiwan',
+    }]
+
+    def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True):
+        episode_title = view_data['title']
+        content_id = season_list['contentId']
+
+        if prompt:
+            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
+
+        all_episodes = [
+            self.url_result(smuggle_url(
+                self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']),
+                {'force_noplaylist': True}))  # To prevent infinite recursion
+            for episode in season_list['episode']]
+
+        return self.playlist_result(all_episodes, content_id, episode_title)
+
+    def _real_extract(self, url):
+        url, data = unsmuggle_url(url, {})
+
+        video_id = self._match_id(url)
+
+        noplaylist = self._downloader.params.get('noplaylist')
+        noplaylist_prompt = True
+        if 'force_noplaylist' in data:
+            noplaylist = data['force_noplaylist']
+            noplaylist_prompt = False
+
+        webpage = self._download_webpage(url, video_id)
+
+        view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
+            r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
+            webpage)))
+
+        vod_data = self._parse_json(self._search_regex(
+            r'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
+            video_id)
+
+        season_list = list(vod_data.get('seasonList', {}).values())
+        if season_list:
+            if not noplaylist:
+                return self._extract_playlist(
+                    season_list[0], video_id, vod_data, view_data,
+                    prompt=noplaylist_prompt)
+
+            if noplaylist_prompt:
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+        # In browsers the `getMainUrl` request is always issued. Usually this
+        # endpoint gives the same result as the data embedded in the webpage.
+        # If georestricted, there is no embedded data, so an extra request is
+        # necessary to get the error code.
+        video_data = self._parse_json(self._search_regex(
+            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
+            webpage, 'video data', default='{}'), video_id)
+        if not video_data:
+            payload = {
+                'assetId': view_data['assetId'],
+                'watchDevices': vod_data['watchDevices'],
+                'contentType': view_data['contentType'],
+            }
+            video_data = self._download_json(
+                'https://www.litv.tv/vod/getMainUrl', video_id,
+                data=json.dumps(payload).encode('utf-8'),
+                headers={'Content-Type': 'application/json'})
+
+        if not video_data.get('fullpath'):
+            error_msg = video_data.get('errorMessage')
+            if error_msg == 'vod.error.outsideregionerror':
+                self.raise_geo_restricted('This video is available in Taiwan only')
+            if error_msg:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+            raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
+
+        formats = self._extract_m3u8_formats(
+            video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
+        for a_format in formats:
+            # LiTV HLS segments don't like compression
+            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+
+        title = view_data['title'] + view_data.get('secondaryMark', '')
+        description = view_data.get('description')
+        thumbnail = view_data.get('imageFile')
+        categories = [item['name'] for item in vod_data.get('category', [])]
+        episode = int_or_none(view_data.get('episode'))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'episode_number': episode,
+        }
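The viewData scrape in LiTVIE collects page-level JavaScript variables into a dict via regex; in miniature, on a hypothetical page snippet:

import re

webpage = "viewData.title = 'Episode 1'; viewData.contentType = 'drama';"
view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
    r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
    webpage)))
# view_data == {'title': 'Episode 1', 'contentType': 'drama'}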
index 29fba5f30b0cc4633dbc978e886c62eab0d4ac81..ea0565ac05099aab8c05609aee4140a1b4c2c1c7 100644 (file)
@@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor):
             'ext': 'flv',
             'description': 'extremely bad day for this guy..!',
             'uploader': 'ljfriel2',
-            'title': 'Most unlucky car accident'
+            'title': 'Most unlucky car accident',
+            'thumbnail': 're:^https?://.*\.jpg$'
         }
     }, {
         'url': 'http://www.liveleak.com/view?i=f93_1390833151',
@@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor):
             'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
             'uploader': 'ARD_Stinkt',
             'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
+            'thumbnail': 're:^https?://.*\.jpg$'
         }
     }, {
         'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
@@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
             'uploader': 'bony333',
-            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
+            'thumbnail': 're:^https?://.*\.jpg$'
         }
     }]
 
@@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor):
         age_limit = int_or_none(self._search_regex(
             r'you confirm that you are ([0-9]+) years and over.',
             webpage, 'age limit', default=None))
+        video_thumbnail = self._og_search_thumbnail(webpage)
 
         sources_raw = self._search_regex(
             r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
@@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor):
             'uploader': video_uploader,
             'formats': formats,
             'age_limit': age_limit,
+            'thumbnail': video_thumbnail,
         }
index eada7c299238953baa9fd3d8219b2754aa7f9356..bc7894bf13ed29963aa1dad7880cf8549be1ca77 100644 (file)
@@ -150,7 +150,7 @@ class LivestreamIE(InfoExtractor):
         }
 
     def _extract_stream_info(self, stream_info):
-        broadcast_id = stream_info['broadcast_id']
+        broadcast_id = compat_str(stream_info['broadcast_id'])
         is_live = stream_info.get('is_live')
 
         formats = []
@@ -203,9 +203,10 @@ class LivestreamIE(InfoExtractor):
             if not videos_info:
                 break
             for v in videos_info:
+                v_id = compat_str(v['id'])
                 entries.append(self.url_result(
-                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']),
-                    'Livestream', v['id'], v['caption']))
+                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id),
+                    'Livestream', v_id, v.get('caption')))
             last_video = videos_info[-1]['id']
         return self.playlist_result(entries, event_id, event_data['full_name'])
 
diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py
new file mode 100644 (file)
index 0000000..aad3961
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class LocalNews8IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
+        'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
+        'info_dict': {
+            'id': '35183304',
+            'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
+            'ext': 'mp4',
+            'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
+            'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
+            'duration': 153,
+            'timestamp': 1441844822,
+            'upload_date': '20150910',
+            'uploader_id': 'api',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
+            webpage, 'partner id', group='id')
+        kaltura_id = self._search_regex(
+            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
+            webpage, 'video id', group='id')
+
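+        # hand the video off to the Kaltura extractor via its
+        # kaltura:<partner_id>:<entry_id> URL scheme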
+        return {
+            '_type': 'url_transparent',
+            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
+            'ie_key': 'Kaltura',
+            'id': video_id,
+            'display_id': display_id,
+        }
index 86d47266f80affd7edaec53dba66ee40a3dd90b9..1237e15735e7fe1b493a20fab40bc2935520cc58 100644 (file)
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
-    clean_html,
     int_or_none,
-    sanitized_Request,
     urlencode_postdata,
 )
 
 
 class LyndaBaseIE(InfoExtractor):
-    _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
+    _SIGNIN_URL = 'https://www.lynda.com/signin'
+    _PASSWORD_URL = 'https://www.lynda.com/signin/password'
+    _USER_URL = 'https://www.lynda.com/signin/user'
     _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
     _NETRC_MACHINE = 'lynda'
 
     def _real_initialize(self):
         self._login()
 
+    @staticmethod
+    def _check_error(json_string, key_or_keys):
+        keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
+        for key in keys:
+            error = json_string.get(key)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+    def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
+        action_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
+            'post url', default=fallback_action_url, group='url')
+
+        if not action_url.startswith('http'):
+            action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
+
+        form_data = self._hidden_inputs(form_html)
+        form_data.update(extra_form_data)
+
+        try:
+            response = self._download_json(
+                action_url, None, note,
+                data=urlencode_postdata(form_data),
+                headers={
+                    'Referer': referrer_url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
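+                # lynda signals invalid credentials with an HTTP 500 whose
+                # JSON body carries per-field error messages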
+                response = self._parse_json(e.cause.read().decode('utf-8'), None)
+                self._check_error(response, ('email', 'password'))
+            raise
+
+        self._check_error(response, 'ErrorMessage')
+
+        return response, action_url
+
     def _login(self):
         username, password = self._get_login_info()
         if username is None:
             return
 
-        login_form = {
-            'username': username,
-            'password': password,
-            'remember': 'false',
-            'stayPut': 'false'
-        }
-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form))
-        login_page = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
-
-        # Not (yet) logged in
-        m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)
-        if m is not None:
-            response = m.group('json')
-            response_json = json.loads(response)
-            state = response_json['state']
-
-            if state == 'notlogged':
-                raise ExtractorError(
-                    'Unable to login, incorrect username and/or password',
-                    expected=True)
-
-            # This is when we get popup:
-            # > You're already logged in to lynda.com on two devices.
-            # > If you log in here, we'll log you out of another device.
-            # So, we need to confirm this.
-            if state == 'conflicted':
-                confirm_form = {
-                    'username': '',
-                    'password': '',
-                    'resolve': 'true',
-                    'remember': 'false',
-                    'stayPut': 'false',
-                }
-                request = sanitized_Request(
-                    self._LOGIN_URL, urlencode_postdata(confirm_form))
-                login_page = self._download_webpage(
-                    request, None,
-                    'Confirming log in and log out from another device')
-
-        if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
-            if 'login error' in login_page:
-                mobj = re.search(
-                    r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
-                    login_page)
-                if mobj:
-                    raise ExtractorError(
-                        'lynda returned error: %s - %s'
-                        % (mobj.group('title'), clean_html(mobj.group('description'))),
-                        expected=True)
-            raise ExtractorError('Unable to log in')
-
-    def _logout(self):
-        username, _ = self._get_login_info()
-        if username is None:
+        # Step 1: download signin page
+        signin_page = self._download_webpage(
+            self._SIGNIN_URL, None, 'Downloading signin page')
+
+        # Already logged in
+        if any(re.search(p, signin_page) for p in (
+                r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
             return
 
-        self._download_webpage(
-            'http://www.lynda.com/ajax/logout.aspx', None,
-            'Logging out', 'Unable to log out', fatal=False)
+        # Step 2: submit email
+        signin_form = self._search_regex(
+            r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
+            signin_page, 'signin form')
+        signin_page, signin_url = self._login_step(
+            signin_form, self._PASSWORD_URL, {'email': username},
+            'Submitting email', self._SIGNIN_URL)
+
+        # Step 3: submit password
+        password_form = signin_page['body']
+        self._login_step(
+            password_form, self._USER_URL, {'email': username, 'password': password},
+            'Submitting password', signin_url)
 
 
 class LyndaIE(LyndaBaseIE):
     IE_NAME = 'lynda'
     IE_DESC = 'lynda.com videos'
     _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)'
-    _NETRC_MACHINE = 'lynda'
 
     _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
 
     _TESTS = [{
         'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
-        'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
+        'md5': '679734f6786145da3546585de9a356be',
         'info_dict': {
             'id': '114408',
             'ext': 'mp4',
@@ -212,8 +212,6 @@ class LyndaCourseIE(LyndaBaseIE):
             'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
             course_id, 'Downloading course JSON')
 
-        self._logout()
-
         if course.get('Status') == 'NotFound':
             raise ExtractorError(
                 'Course %s does not exist' % course_id, expected=True)
@@ -246,5 +244,6 @@ class LyndaCourseIE(LyndaBaseIE):
                 % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
 
         course_title = course.get('Title')
+        course_description = course.get('Description')
 
-        return self.playlist_result(entries, course_id, course_title)
+        return self.playlist_result(entries, course_id, course_title, course_description)
index d5945ad66b3a784263fb1c5106534081b1f04913..39d2742c89282c2773ee1aca44ca14f047393bc5 100644 (file)
@@ -1,8 +1,6 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -23,34 +21,5 @@ class M6IE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
-                                 'Downloading video RSS')
-
-        title = rss.find('./channel/item/title').text
-        description = rss.find('./channel/item/description').text
-        thumbnail = rss.find('./channel/item/visuel_clip_big').text
-        duration = int(rss.find('./channel/item/duration').text)
-        view_count = int(rss.find('./channel/item/nombre_vues').text)
-
-        formats = []
-        for format_id in ['lq', 'sd', 'hq', 'hd']:
-            video_url = rss.find('./channel/item/url_video_%s' % format_id)
-            if video_url is None:
-                continue
-            formats.append({
-                'url': video_url.text,
-                'format_id': format_id,
-            })
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'formats': formats,
-        }
+        video_id = self._match_id(url)
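+        # M6 clips are now served through the 6play platform, so simply
+        # delegate to the SixPlay extractor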
+        return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
deleted file mode 100644 (file)
index 92511a6..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-
-
-class MalemotionIE(InfoExtractor):
-    _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
-    _TEST = {
-        'url': 'http://malemotion.com/video/bete-de-concours.ltc',
-        'md5': '3013e53a0afbde2878bc39998c33e8a5',
-        'info_dict': {
-            'id': 'ltc',
-            'ext': 'mp4',
-            'title': 'Bête de Concours',
-            'age_limit': 18,
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = compat_urllib_parse_unquote(self._search_regex(
-            r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
-        video_title = self._html_search_regex(
-            r'<title>(.*?)</title', webpage, 'title')
-        video_thumbnail = self._search_regex(
-            r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False)
-
-        formats = [{
-            'url': video_url,
-            'ext': 'mp4',
-            'format_id': 'mp4',
-            'preference': 1,
-        }]
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'formats': formats,
-            'title': video_title,
-            'thumbnail': video_thumbnail,
-            'age_limit': 18,
-        }
index 80a0d7013b064b6a919303c80430576f61d36e7c..33b0b539fa9dfde80274d983aa003ea7b39e6622 100644 (file)
@@ -4,16 +4,12 @@ from __future__ import unicode_literals
 import random
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-from ..utils import (
-    sanitized_Request,
-    xpath_text,
-)
+from ..utils import xpath_text
 
 
 class MatchTVIE(InfoExtractor):
-    _VALID_URL = r'https?://matchtv\.ru/?#live-player'
-    _TEST = {
+    _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+    _TESTS = [{
         'url': 'http://matchtv.ru/#live-player',
         'info_dict': {
             'id': 'matchtv-live',
@@ -24,12 +20,16 @@ class MatchTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://matchtv.ru/on-air/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = 'matchtv-live'
-        request = sanitized_Request(
-            'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse_urlencode({
+        video_url = self._download_json(
+            'http://player.matchtv.ntvplus.tv/player/smil', video_id,
+            query={
                 'ts': '',
                 'quality': 'SD',
                 'contentId': '561d2c0df7159b37178b4567',
@@ -40,11 +40,10 @@ class MatchTVIE(InfoExtractor):
                 'contentType': 'channel',
                 'timeShift': '0',
                 'platform': 'portal',
-            }),
+            },
             headers={
                 'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
-            })
-        video_url = self._download_json(request, video_id)['data']['videoUrl']
+            })['data']['videoUrl']
         f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
         formats = self._extract_f4m_formats(f4m_url, video_id)
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py
new file mode 100644 (file)
index 0000000..cdb46e1
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .pladform import PladformIE
+from ..utils import (
+    unescapeHTML,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class METAIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://video.meta.ua/5502115.video',
+        'md5': '71b6f3ee274bef16f1ab410f7f56b476',
+        'info_dict': {
+            'id': '5502115',
+            'ext': 'mp4',
+            'title': 'Sony Xperia Z camera test [HQ]',
+            'description': 'Xperia Z shoots video in FullHD HDR.',
+            'uploader_id': 'nomobile',
+            'uploader': 'CHЁZA.TV',
+            'upload_date': '20130211',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://video.meta.ua/iframe/5502115',
+        'only_matching': True,
+    }, {
+        # pladform embed
+        'url': 'http://video.meta.ua/7121015.video',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        st_html5 = self._search_regex(
+            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)
+
+        if st_html5:
+            # the uppod st decryption algorithm was reverse-engineered from
+            # the function un(s) in uppod.js
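+            # each 3-hex-digit chunk is turned into an HTML numeric character
+            # reference (&#x0XXX;), so that unescapeHTML() below decodes the
+            # whole string back into the original JSON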
+            json_str = ''
+            for i in range(0, len(st_html5), 3):
+                json_str += '&#x0%s;' % st_html5[i:i + 3]
+            uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
+            error = uppod_data.get('customnotfound')
+            if error:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+            video_url = uppod_data['file']
+            info = {
+                'id': video_id,
+                'url': video_url,
+                'title': uppod_data.get('comment') or self._og_search_title(webpage),
+                'description': self._og_search_description(webpage, default=None),
+                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
+                'duration': int_or_none(self._og_search_property(
+                    'video:duration', webpage, default=None)),
+            }
+            if 'youtube.com/' in video_url:
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': 'Youtube',
+                })
+            return info
+
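+        # no uppod player on the page, so fall back to a Pladform embed if present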
+        pladform_url = PladformIE._extract_url(webpage)
+        if pladform_url:
+            return self.url_result(pladform_url)
index 61dadb7a7de9cbd6134e4a285e655487276c4bce..e6e7659a1de0ebe86f48a4128192de5d14d6d586 100644 (file)
@@ -11,13 +11,14 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
-    sanitized_Request,
     urlencode_postdata,
+    get_element_by_attribute,
+    mimetype2ext,
 )
 
 
 class MetacafeIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
     IE_NAME = 'metacafe'
@@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):
                 'uploader': 'ign',
                 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
             },
+            'skip': 'Page is temporarily unavailable.',
         },
         # AnyClip video
         {
@@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor):
                 'id': 'an-dVVXnuY7Jh77J',
                 'ext': 'mp4',
                 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
-                'uploader': 'anyclip',
-                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+                'uploader': 'AnyClip',
+                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
             },
         },
         # age-restricted video
@@ -81,6 +83,9 @@ class MetacafeIE(InfoExtractor):
                 'title': 'Open: This is Face the Nation, February 9',
                 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
                 'duration': 96,
+                'uploader': 'CBSI-NEW',
+                'upload_date': '20140209',
+                'timestamp': 1391959800,
             },
             'params': {
                 # rtmp download
@@ -107,28 +112,25 @@ class MetacafeIE(InfoExtractor):
     def report_disclaimer(self):
         self.to_screen('Retrieving disclaimer')
 
-    def _real_initialize(self):
+    def _confirm_age(self):
         # Retrieve disclaimer
         self.report_disclaimer()
         self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
 
         # Confirm age
-        disclaimer_form = {
-            'filters': '0',
-            'submit': "Continue - I'm over 18",
-        }
-        request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         self.report_age_confirmation()
-        self._download_webpage(request, None, False, 'Unable to confirm age')
+        self._download_webpage(
+            self._FILTER_POST, None, False, 'Unable to confirm age',
+            data=urlencode_postdata({
+                'filters': '0',
+                'submit': "Continue - I'm over 18",
+            }), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
 
     def _real_extract(self, url):
         # Extract id and simplified title from URL
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-
-        video_id = mobj.group(1)
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
 
         # the video may come from an external site
         m_external = re.match('^(\w{2})-(.*)$', video_id)
@@ -141,15 +143,24 @@ class MetacafeIE(InfoExtractor):
             if prefix == 'cb':
                 return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
 
-        # Retrieve video webpage to extract further information
-        req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id)
+        # self._confirm_age()
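+        # (age confirmation appears to be no longer required, so the call
+        # above is left disabled)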
 
         # AnyClip videos require the flashversion cookie so that we get the link
         # to the mp4 file
-        mobj_an = re.match(r'^an-(.*?)$', video_id)
-        if mobj_an:
-            req.headers['Cookie'] = 'flashVersion=0;'
-        webpage = self._download_webpage(req, video_id)
+        headers = {}
+        if video_id.startswith('an-'):
+            headers['Cookie'] = 'flashVersion=0;'
+
+        # Retrieve video webpage to extract further information
+        webpage = self._download_webpage(url, video_id, headers=headers)
+
+        error = get_element_by_attribute(
+            'class', 'notfound-page-title', webpage)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        video_title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title',
+            default=None) or self._search_regex(
+                r'<h1>(.*?)</h1>', webpage, 'title')
 
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)
@@ -213,20 +224,40 @@ class MetacafeIE(InfoExtractor):
                         'player_url': player_url,
                         'ext': play_path.partition(':')[0],
                     })
+        if video_url is None:
+            flashvars = self._parse_json(self._search_regex(
+                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+                default=None), video_id, fatal=False)
+            if flashvars:
+                video_url = []
+                for source in flashvars.get('sources') or []:
+                    source_url = source.get('src')
+                    if not source_url:
+                        continue
+                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                    if ext == 'm3u8':
+                        video_url.extend(self._extract_m3u8_formats(
+                            source_url, video_id, 'mp4',
+                            'm3u8_native', m3u8_id='hls', fatal=False))
+                    else:
+                        video_url.append({
+                            'url': source_url,
+                            'ext': ext,
+                        })
 
         if video_url is None:
             raise ExtractorError('Unsupported video type')
 
-        video_title = self._html_search_regex(
-            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'],
+            webpage, 'description', fatal=False)
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
         video_uploader = self._html_search_regex(
             r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
             webpage, 'uploader nickname', fatal=False)
         duration = int_or_none(
-            self._html_search_meta('video:duration', webpage))
-
+            self._html_search_meta('video:duration', webpage, default=None))
         age_limit = (
             18
             if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
@@ -239,10 +270,11 @@ class MetacafeIE(InfoExtractor):
                 'url': video_url,
                 'ext': video_ext,
             }]
-
         self._sort_formats(formats)
+
         return {
             'id': video_id,
+            'display_id': display_id,
             'description': description,
             'uploader': video_uploader,
             'title': video_title,
index a14d176a550c1cc5a21f9f06255a686afd24d03f..d970e94ecd2425db06056ea615af02b92a39e292 100644 (file)
@@ -11,7 +11,7 @@ class MGTVIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
-        'md5': '',
+        'md5': '1bdadcf760a0b90946ca68ee9a2db41a',
         'info_dict': {
             'id': '3116640',
             'ext': 'mp4',
@@ -20,37 +20,39 @@ class MGTVIE(InfoExtractor):
             'duration': 7461,
             'thumbnail': 're:^https?://.*\.jpg$',
         },
-        'params': {
-            'skip_download': True,  # m3u8 download
-        },
-    }
-
-    _FORMAT_MAP = {
-        '标清': ('Standard', 0),
-        '高清': ('High', 1),
-        '超清': ('SuperHigh', 2),
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         api_data = self._download_json(
             'http://v.api.mgtv.com/player/video', video_id,
-            query={'video_id': video_id})['data']
+            query={'video_id': video_id},
+            headers=self.geo_verification_headers())['data']
         info = api_data['info']
 
         formats = []
         for idx, stream in enumerate(api_data['stream']):
-            format_name = stream.get('name')
-            format_id, preference = self._FORMAT_MAP.get(format_name, (None, None))
-            format_info = self._download_json(
-                stream['url'], video_id,
-                note='Download video info for format %s' % format_id or '#%d' % idx)
-            formats.append({
-                'format_id': format_id,
-                'url': format_info['info'],
-                'ext': 'mp4',  # These are m3u8 playlists
-                'preference': preference,
-            })
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            tbr = int_or_none(self._search_regex(
+                r'(\d+)\.mp4', stream_url, 'tbr', default=None))
+
+            def extract_format(stream_url, format_id, idx, query={}):
+                format_info = self._download_json(
+                    stream_url, video_id,
+                    note='Downloading video info for format %s' % (format_id or '#%d' % idx),
+                    query=query)
+                return {
+                    'format_id': format_id,
+                    'url': format_info['info'],
+                    'ext': 'mp4',
+                    'tbr': tbr,
+                }
+
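+            # each stream is exposed twice: as the HLS playlist itself and,
+            # with the playlist suffix dropped and pno=1031, as what appears
+            # to be a progressive HTTP download of the same rendition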
+            formats.append(extract_format(
+                stream_url, 'hls-%d' % tbr if tbr else None, idx * 2))
+            formats.append(extract_format(stream_url.replace(
+                '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031}))
         self._sort_formats(formats)
 
         return {
diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py
new file mode 100644 (file)
index 0000000..afd3e98
--- /dev/null
@@ -0,0 +1,192 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_xpath,
+)
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    smuggle_url,
+    unsmuggle_url,
+    xpath_text,
+)
+
+
+class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
+    def _extract_base_url(self, course_id, display_id):
+        return self._download_json(
+            'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
+            display_id, 'Downloading course base URL')
+
+    def _extract_chapter_and_title(self, title):
+        if not title:
+            return None, None
+        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
+        return (int(m.group('chapter')), m.group('title')) if m else (None, title)
+
+
+class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
+    IE_NAME = 'mva'
+    IE_DESC = 'Microsoft Virtual Academy videos'
+    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
+
+    _TESTS = [{
+        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
+        'md5': '7826c44fc31678b12ad8db11f6b5abb9',
+        'info_dict': {
+            'id': 'gfVXISmEB_6804984382',
+            'ext': 'mp4',
+            'title': 'Course Introduction',
+            'formats': 'mincount:3',
+            'subtitles': {
+                'en': [{
+                    'ext': 'ttml',
+                }],
+            },
+        }
+    }, {
+        'url': 'mva:11788:gfVXISmEB_6804984382',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        course_id = mobj.group('course_id')
+        video_id = mobj.group('id')
+
+        base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
+
+        settings = self._download_xml(
+            '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
+            video_id, 'Downloading video settings XML')
+
+        _, title = self._extract_chapter_and_title(xpath_text(
+            settings, './/Title', 'title', fatal=True))
+
+        formats = []
+
+        for sources in settings.findall(compat_xpath('.//MediaSources')):
+            if sources.get('videoType') == 'smoothstreaming':
+                continue
+            for source in sources.findall(compat_xpath('./MediaSource')):
+                video_url = source.text
+                if not video_url or not video_url.startswith('http'):
+                    continue
+                video_mode = source.get('videoMode')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
+                codec = source.get('codec')
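+                # the codec attribute is presumably either an
+                # "<acodec>,<vcodec>" pair or a single video codec name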
+                acodec, vcodec = [None] * 2
+                if codec:
+                    codecs = codec.split(',')
+                    if len(codecs) == 2:
+                        acodec, vcodec = codecs
+                    elif len(codecs) == 1:
+                        vcodec = codecs[0]
+                formats.append({
+                    'url': video_url,
+                    'format_id': video_mode,
+                    'height': height,
+                    'acodec': acodec,
+                    'vcodec': vcodec,
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
+            subtitle_url = source.text
+            if not subtitle_url:
+                continue
+            subtitles.setdefault('en', []).append({
+                'url': '%s/%s' % (base_url, subtitle_url),
+                'ext': source.get('type'),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'subtitles': subtitles,
+            'formats': formats
+        }
+
+
+class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
+    IE_NAME = 'mva:course'
+    IE_DESC = 'Microsoft Virtual Academy courses'
+    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
+
+    _TESTS = [{
+        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+        'info_dict': {
+            'id': '11788',
+            'title': 'Microsoft Azure Fundamentals: Virtual Machines',
+        },
+        'playlist_count': 36,
+    }, {
+        # with emphasized chapters
+        'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
+        'info_dict': {
+            'id': '16335',
+            'title': 'Developing Windows 10 Games with Construct 2',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+        'only_matching': True,
+    }, {
+        'url': 'mva:course:11788',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
+            MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        base_url = self._extract_base_url(course_id, display_id)
+
+        manifest = self._download_json(
+            '%s/imsmanifestlite.json' % base_url,
+            display_id, 'Downloading course manifest JSON')['manifest']
+
+        organization = manifest['organizations']['organization'][0]
+
+        entries = []
+        for chapter in organization['item']:
+            chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
+            chapter_id = chapter.get('@identifier')
+            for item in chapter.get('item', []):
+                item_id = item.get('@identifier')
+                if not item_id:
+                    continue
+                metadata = item.get('resource', {}).get('metadata') or {}
+                if metadata.get('learningresourcetype') != 'Video':
+                    continue
+                _, title = self._extract_chapter_and_title(item.get('title'))
+                duration = parse_duration(metadata.get('duration'))
+                description = metadata.get('description')
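+                # smuggle the already-resolved base_url into each entry URL so
+                # that MicrosoftVirtualAcademyIE can skip its own base URL lookup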
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': smuggle_url(
+                        'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'chapter': chapter_title,
+                    'chapter_number': chapter_number,
+                    'chapter_id': chapter_id,
+                })
+
+        title = organization.get('title') or manifest.get('metadata', {}).get('title')
+
+        return self.playlist_result(entries, course_id, title)
index 7b4581dc58415f508ca0d34d61a5cd96b0b08e31..cd169f3616729871fcad9e6c619e67f392f73312 100644 (file)
@@ -1,5 +1,8 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse_urlencode,
@@ -8,25 +11,104 @@ from ..compat import (
 from ..utils import (
     get_element_by_attribute,
     int_or_none,
+    remove_start,
+    extract_attributes,
+    determine_ext,
 )
 
 
-class MiTeleIE(InfoExtractor):
+class MiTeleBaseIE(InfoExtractor):
+    def _get_player_info(self, url, webpage):
+        player_data = extract_attributes(self._search_regex(
+            r'(?s)(<ms-video-player.+?</ms-video-player>)',
+            webpage, 'ms video player'))
+        video_id = player_data['data-media-id']
+        config_url = compat_urlparse.urljoin(url, player_data['data-config'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        mmc_url = config['services']['mmc']
+
+        duration = None
+        formats = []
+        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
+            mmc = self._download_json(
+                m_url, video_id, 'Downloading mmc JSON')
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                bas = location.get('bas')
+                loc = location.get('loc')
+                ogn = location.get('ogn')
+                if None in (gat, bas, loc, ogn):
+                    continue
+                token_data = {
+                    'bas': bas,
+                    'icd': loc,
+                    'ogn': ogn,
+                    'sta': '0',
+                }
+                media = self._download_json(
+                    '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
+                    video_id, 'Downloading %s JSON' % loc)
+                file_ = media.get('file')
+                if not file_:
+                    continue
+                ext = determine_ext(file_)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
+
+
+class MiTeleIE(MiTeleBaseIE):
     IE_DESC = 'mitele.es'
-    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+    _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/'
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
-        'md5': '0ff1a13aebb35d9bc14081ff633dd324',
+        # MD5 is unstable
         'info_dict': {
             'id': '0NF1jJnxS1Wu3pHrmvFyw2',
             'display_id': 'programa-144',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Tor, la web invisible',
             'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+            'series': 'Diario de',
+            'season': 'La redacción',
+            'episode': 'Programa 144',
             'thumbnail': 're:(?i)^https?://.*\.jpg$',
             'duration': 2913,
         },
+    }, {
+        # no explicit title
+        'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/',
+        'info_dict': {
+            'id': 'eLZSwoEd1S3pVyUm8lc6F',
+            'display_id': 'programa-226',
+            'ext': 'mp4',
+            'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
+            'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
+            'series': 'Cuarto Milenio',
+            'season': 'Temporada 6',
+            'episode': 'Programa 226',
+            'thumbnail': 're:(?i)^https?://.*\.jpg$',
+            'duration': 7312,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -34,56 +116,32 @@ class MiTeleIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        config_url = self._search_regex(
-            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
-        config_url = compat_urlparse.urljoin(url, config_url)
-
-        config = self._download_json(
-            config_url, display_id, 'Downloading config JSON')
-
-        mmc = self._download_json(
-            config['services']['mmc'], display_id, 'Downloading mmc JSON')
-
-        formats = []
-        for location in mmc['locations']:
-            gat = self._proto_relative_url(location.get('gat'), 'http:')
-            bas = location.get('bas')
-            loc = location.get('loc')
-            ogn = location.get('ogn')
-            if None in (gat, bas, loc, ogn):
-                continue
-            token_data = {
-                'bas': bas,
-                'icd': loc,
-                'ogn': ogn,
-                'sta': '0',
-            }
-            media = self._download_json(
-                '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
-                display_id, 'Downloading %s JSON' % location['loc'])
-            file_ = media.get('file')
-            if not file_:
-                continue
-            formats.extend(self._extract_f4m_formats(
-                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
-                display_id, f4m_id=loc))
-        self._sort_formats(formats)
+        info = self._get_player_info(url, webpage)
 
         title = self._search_regex(
-            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title')
+            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
+            webpage, 'title', default=None)
 
-        video_id = self._search_regex(
-            r'data-media-id\s*=\s*"([^"]+)"', webpage,
-            'data media id', default=None) or display_id
-        thumbnail = config.get('poster', {}).get('imageUrl')
-        duration = int_or_none(mmc.get('duration'))
+        mobj = re.search(r'''(?sx)
+                            class="Destacado-text"[^>]*>.*?<h1>\s*
+                            <span>(?P<series>[^<]+)</span>\s*
+                            <span>(?P<season>[^<]+)</span>\s*
+                            <span>(?P<episode>[^<]+)</span>''', webpage)
+        series, season, episode = mobj.groups() if mobj else [None] * 3
 
-        return {
-            'id': video_id,
+        if not title:
+            if mobj:
+                title = '%s - %s - %s' % (series, season, episode)
+            else:
+                title = remove_start(self._search_regex(
+                    r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
+
+        info.update({
             'display_id': display_id,
             'title': title,
             'description': get_element_by_attribute('class', 'text', webpage),
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
+            'series': series,
+            'season': season,
+            'episode': episode,
+        })
+        return info
index 483f6925fda989fc5111694c8c82f1807a1f3d97..560fe188b675a619785332eea285484fa85154bf 100644 (file)
@@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor):
         description = self._og_search_description(webpage)
         like_count = parse_count(self._search_regex(
             r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
-            webpage, 'like count', fatal=False))
+            webpage, 'like count', default=None))
         view_count = str_to_int(self._search_regex(
             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
              r'/listeners/?">([0-9,.]+)</a>'],
-            webpage, 'play count', fatal=False))
+            webpage, 'play count', default=None))
 
         return {
             'id': track_id,
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
new file mode 100644 (file)
index 0000000..1ec8e0f
--- /dev/null
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class MSNIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
+        'md5': '8442f66c116cbab1ff7098f986983458',
+        'info_dict': {
+            'id': 'BBqQYNE',
+            'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
+            'ext': 'mp4',
+            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
+            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
+            'duration': 104,
+            'uploader': 'CBS Entertainment',
+            'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
+        },
+    }, {
+        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
+                webpage, 'video data', default='{}', group='data'),
+            display_id, transform_source=unescapeHTML)
+
+        if not video:
+            error = unescapeHTML(self._search_regex(
+                r'data-error=(["\'])(?P<error>.+?)\1',
+                webpage, 'error', group='error'))
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        title = video['title']
+
+        formats = []
+        for file_ in video.get('videoFiles', []):
+            format_url = file_.get('url')
+            if not format_url:
+                continue
+            ext = determine_ext(format_url)
+            # .ism is not yet supported (see
+            # https://github.com/rg3/youtube-dl/issues/8118)
+            if ext == 'ism':
+                continue
+            if 'm3u8' in format_url:
+                # m3u8_native should not be used here until
+                # https://github.com/rg3/youtube-dl/issues/9913 is fixed
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4',
+                    m3u8_id='hls', fatal=False)
+                # Despite metadata in m3u8 all video+audio formats are
+                # actually video-only (no audio)
+                for f in m3u8_formats:
+                    if f.get('acodec') != 'none' and f.get('vcodec') != 'none':
+                        f['acodec'] = 'none'
+                formats.extend(m3u8_formats)
+            else:
+                formats.append({
+                    'url': format_url,
+                    'ext': 'mp4',
+                    'format_id': 'http',
+                    'width': int_or_none(file_.get('width')),
+                    'height': int_or_none(file_.get('height')),
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for file_ in video.get('files', []):
+            format_url = file_.get('url')
+            format_code = file_.get('formatCode')
+            if not format_url or not format_code:
+                continue
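+            # formatCode 3100 appears to identify closed-caption (subtitle) files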
+            if compat_str(format_code) == '3100':
+                subtitles.setdefault(file_.get('culture', 'en'), []).append({
+                    'ext': determine_ext(format_url, 'ttml'),
+                    'url': format_url,
+                })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('headlineImage', {}).get('url'),
+            'duration': int_or_none(video.get('durationSecs')),
+            'uploader': video.get('sourceFriendly'),
+            'uploader_id': video.get('providerId'),
+            'creator': video.get('creator'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
index 640ee3d9339c48e2b3fef0ade15ee8ebcae8b292..dd06395891ac02ce81b40efd471776cb080d328d 100644 (file)
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse_urlencode,
     compat_str,
+    compat_xpath,
 )
 from ..utils import (
     ExtractorError,
@@ -84,9 +85,10 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 rtmp_video_url = rendition.find('./src').text
                 if rtmp_video_url.endswith('siteunavail.png'):
                     continue
+                new_url = self._transform_rtmp_url(rtmp_video_url)
                 formats.append({
-                    'ext': ext,
-                    'url': self._transform_rtmp_url(rtmp_video_url),
+                    'ext': 'flv' if new_url.startswith('rtmp') else ext,
+                    'url': new_url,
                     'format_id': rendition.get('bitrate'),
                     'width': int(rendition.get('width')),
                     'height': int(rendition.get('height')),
@@ -139,9 +141,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 itemdoc, './/{http://search.yahoo.com/mrss/}category',
                 'scheme', 'urn:mtvn:video_title')
         if title_el is None:
-            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
+            title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
         if title_el is None:
-            title_el = itemdoc.find('.//title') or itemdoc.find('./title')
+            title_el = itemdoc.find(compat_xpath('.//title'))
             if title_el.text is None:
                 title_el = None
 
diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py
deleted file mode 100644 (file)
index cbc8004..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-
-
-class MuzuTVIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
-    IE_NAME = 'muzu.tv'
-
-    _TEST = {
-        'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/',
-        'md5': '98f8b2c7bc50578d6a0364fff2bfb000',
-        'info_dict': {
-            'id': '1981454',
-            'ext': 'mp4',
-            'title': 'Cat Walk (Original Mix)',
-            'description': 'md5:90e868994de201b2570e4e5854e19420',
-            'uploader': 'MarcAshken featuring SOS',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        info_data = compat_urllib_parse_urlencode({
-            'format': 'json',
-            'url': url,
-        })
-        info = self._download_json(
-            'http://www.muzu.tv/api/oembed/?%s' % info_data,
-            video_id, 'Downloading video info')
-
-        player_info = self._download_json(
-            'http://player.muzu.tv/player/playerInit?ai=%s' % video_id,
-            video_id, 'Downloading player info')
-        video_info = player_info['videos'][0]
-        for quality in ['1080', '720', '480', '360']:
-            if video_info.get('v%s' % quality):
-                break
-
-        data = compat_urllib_parse_urlencode({
-            'ai': video_id,
-            # Even if each time you watch a video the hash changes,
-            # it seems to work for different videos, and it will work
-            # even if you use any non empty string as a hash
-            'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k',
-            'device': 'web',
-            'qv': quality,
-        })
-        video_url_info = self._download_json(
-            'http://player.muzu.tv/player/requestVideo?%s' % data,
-            video_id, 'Downloading video url')
-        video_url = video_url_info['url']
-
-        return {
-            'id': video_id,
-            'title': info['title'],
-            'url': video_url,
-            'thumbnail': info['thumbnail_url'],
-            'description': info['description'],
-            'uploader': info['author_name'],
-        }
index 66b5231979ce8399c38074a9a14a2b4ea2114ee8..a103e0323a6c62e4b0d283afdb6d4f5662bb1869 100644 (file)
@@ -10,9 +10,10 @@ from ..utils import (
 
 class MwaveIE(InfoExtractor):
     _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)'
+    _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s'
     _TEST = {
         'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859',
-        'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc',
+        # md5 is unstable
         'info_dict': {
             'id': '168859',
             'ext': 'flv',
@@ -56,3 +57,28 @@ class MwaveIE(InfoExtractor):
             'view_count': int_or_none(vod_info.get('hit')),
             'formats': formats,
         }
+
+
+class MwaveMeetGreetIE(InfoExtractor):
+    _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://mwave.interest.me/meetgreet/view/256',
+        'info_dict': {
+            'id': '173294',
+            'ext': 'flv',
+            'title': '[MEET&GREET] Park BoRam',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Mwave',
+            'duration': 3634,
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        clip_id = self._html_search_regex(
+            r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)',
+            webpage, 'clip ID')
+        clip_url = MwaveIE._URL_TEMPLATE % clip_id
+        return self.url_result(clip_url, 'Mwave', clip_id)
index 72251866303885f6cd9b040ec9ad3f042d8add6a..e717abb9fa44fbb70b7cd8d7068e989b662648de 100644 (file)
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from .theplatform import ThePlatformIE
 from ..utils import (
     smuggle_url,
     url_basename,
@@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor):
         }
 
 
-class NationalGeographicChannelIE(InfoExtractor):
+class NationalGeographicChannelIE(ThePlatformIE):
     IE_NAME = 'natgeo:channel'
     _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
 
@@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor):
         release_url = self._search_regex(
             r'video_auth_playlist_url\s*=\s*"([^"]+)"',
             webpage, 'release url')
+        query = {
+            'mbr': 'true',
+            'switch': 'http',
+        }
+        is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
+        if is_auth == 'auth':
+            auth_resource_id = self._search_regex(
+                r"video_auth_resourceId\s*=\s*'([^']+)'",
+                webpage, 'auth resource id')
+            query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or ''
 
         return {
             '_type': 'url_transparent',
             'ie_key': 'ThePlatform',
             'url': smuggle_url(
-                update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
+                update_url_query(release_url, query),
                 {'force_smil_url': True}),
             'display_id': display_id,
         }
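
The National Geographic change only touches the query string of the smuggled ThePlatform release URL: 'auth' is appended when the page flags the video as authenticated. In isolation (update_url_query is the helper this file already uses; the token and release URL are placeholders):

    from youtube_dl.utils import update_url_query

    query = {'mbr': 'true', 'switch': 'http'}
    is_auth = 'auth'  # scraped from the page's video_is_auth variable
    if is_auth == 'auth':
        query['auth'] = 'ADOBE-PASS-TOKEN'  # placeholder for the _extract_mvpd_auth() result

    release_url = 'http://link.theplatform.com/s/ngs/xyz'  # hypothetical
    print(update_url_query(release_url, query))
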
index f9d42d07a4f4995ce86be23ffaf73033ac91aadf..f694e210b1dadceb030cb24f6498abe30de5b976 100644 (file)
@@ -9,10 +9,6 @@ from ..utils import (
     lowercase_escape,
     smuggle_url,
     unescapeHTML,
-    update_url_query,
-    int_or_none,
-    HEADRequest,
-    parse_iso8601,
 )
 
 
@@ -67,6 +63,23 @@ class NBCIE(InfoExtractor):
             # This video has expired but with an escaped embedURL
             'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
             'only_matching': True,
+        },
+        {
+            # HLS streams require the 'hdnea3' cookie
+            'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
+            'info_dict': {
+                'id': 'n1806',
+                'ext': 'mp4',
+                'title': 'Goliath',
+                'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
+                'timestamp': 1237100400,
+                'upload_date': '20090315',
+                'uploader': 'NBCU-COM',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from US',
         }
     ]
 
@@ -134,6 +147,9 @@ class NBCSportsIE(InfoExtractor):
             'ext': 'flv',
             'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
             'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+            'uploader': 'NBCU-SPORTS',
+            'upload_date': '20150330',
+            'timestamp': 1427726529,
         }
     }
 
@@ -172,9 +188,9 @@ class CSNNEIE(InfoExtractor):
 
 
 class NBCNewsIE(ThePlatformIE):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
         (?:video/.+?/(?P<id>\d+)|
-        ([^/]+/)*(?P<display_id>[^/?]+))
+        ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
         '''
 
     _TESTS = [
@@ -196,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):
                 'ext': 'mp4',
                 'title': 'How Twitter Reacted To The Snowden Interview',
                 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+                'uploader': 'NBCU-NEWS',
+                'timestamp': 1401363060,
+                'upload_date': '20140529',
             },
         },
         {
             'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
             'md5': 'fdbf39ab73a72df5896b6234ff98518a',
             'info_dict': {
-                'id': 'Wjf9EDR3A_60',
+                'id': '529953347624',
                 'ext': 'mp4',
                 'title': 'FULL EPISODE: Family Business',
                 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
@@ -217,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):
                 'ext': 'mp4',
                 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+                'timestamp': 1423104900,
+                'uploader': 'NBCU-NEWS',
+                'upload_date': '20150205',
             },
         },
         {
@@ -225,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):
             'info_dict': {
                 'id': '529953347624',
                 'ext': 'mp4',
-                'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'',
-                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
+                'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
+                'description': 'md5:c8be487b2d80ff0594c005add88d8351',
+                'upload_date': '20150922',
+                'timestamp': 1442917800,
+                'uploader': 'NBCU-NEWS',
             },
-            'expected_warnings': ['http-6000 is not available']
         },
         {
             'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
@@ -240,12 +264,33 @@ class NBCNewsIE(ThePlatformIE):
                 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
                 'upload_date': '20160420',
                 'timestamp': 1461152093,
+                'uploader': 'NBCU-NEWS',
+            },
+        },
+        {
+            'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+            'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+            'info_dict': {
+                'id': '314487875924',
+                'ext': 'mp4',
+                'title': 'The chaotic GOP immigration vote',
+                'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'timestamp': 1406937606,
+                'upload_date': '20140802',
+                'uploader': 'NBCU-NEWS',
+                'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
             },
         },
         {
             'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
             'only_matching': True,
         },
+        {
+            # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
+            'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
@@ -265,106 +310,28 @@ class NBCNewsIE(ThePlatformIE):
             }
         else:
             # "feature" and "nightly-news" pages use theplatform.com
-            display_id = mobj.group('display_id')
-            webpage = self._download_webpage(url, display_id)
-            info = None
-            bootstrap_json = self._search_regex(
-                r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
-                webpage, 'bootstrap json', default=None)
-            if bootstrap_json:
-                bootstrap = self._parse_json(bootstrap_json, display_id)
-                info = bootstrap['results'][0]['video']
-            else:
-                player_instance_json = self._search_regex(
-                    r'videoObj\s*:\s*({.+})', webpage, 'player instance', default=None)
-                if not player_instance_json:
-                    player_instance_json = self._html_search_regex(
-                        r'data-video="([^"]+)"', webpage, 'video json')
-                info = self._parse_json(player_instance_json, display_id)
-            video_id = info['mpxId']
-            title = info['title']
-
-            subtitles = {}
-            caption_links = info.get('captionLinks')
-            if caption_links:
-                for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
-                    sub_url = caption_links.get(sub_key)
-                    if sub_url:
-                        subtitles.setdefault('en', []).append({
-                            'url': sub_url,
-                            'ext': sub_ext,
-                        })
-
-            formats = []
-            for video_asset in info['videoAssets']:
-                video_url = video_asset.get('publicUrl')
-                if not video_url:
-                    continue
-                container = video_asset.get('format')
-                asset_type = video_asset.get('assetType') or ''
-                if container == 'ISM' or asset_type == 'FireTV-Once':
-                    continue
-                elif asset_type == 'OnceURL':
-                    tp_formats, tp_subtitles = self._extract_theplatform_smil(
-                        video_url, video_id)
-                    formats.extend(tp_formats)
-                    subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+            video_id = mobj.group('mpx_id')
+            if not video_id.isdigit():
+                webpage = self._download_webpage(url, video_id)
+                bootstrap_json = self._search_regex(
+                    [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
+                     r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
+                    webpage, 'bootstrap json')
+                bootstrap = self._parse_json(
+                    bootstrap_json, video_id, transform_source=unescapeHTML)
+                if 'results' in bootstrap:
+                    info = bootstrap['results'][0]['video']
+                elif 'video' in bootstrap:
+                    info = bootstrap['video']
                 else:
-                    tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
-                    format_id = 'http%s' % ('-%d' % tbr if tbr else '')
-                    video_url = update_url_query(
-                        video_url, {'format': 'redirect'})
-                    # resolve the url so that we can check availability and detect the correct extension
-                    head = self._request_webpage(
-                        HEADRequest(video_url), video_id,
-                        'Checking %s url' % format_id,
-                        '%s is not available' % format_id,
-                        fatal=False)
-                    if head:
-                        video_url = head.geturl()
-                        formats.append({
-                            'format_id': format_id,
-                            'url': video_url,
-                            'width': int_or_none(video_asset.get('width')),
-                            'height': int_or_none(video_asset.get('height')),
-                            'tbr': tbr,
-                            'container': video_asset.get('format'),
-                        })
-            self._sort_formats(formats)
+                    info = bootstrap
+                video_id = info['mpxId']
 
             return {
+                '_type': 'url_transparent',
                 'id': video_id,
-                'title': title,
-                'description': info.get('description'),
-                'thumbnail': info.get('thumbnail'),
-                'duration': int_or_none(info.get('duration')),
-                'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
-                'formats': formats,
-                'subtitles': subtitles,
+                # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+                'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id,
+                'ie_key': 'ThePlatformFeed',
             }
-
-
-class MSNBCIE(InfoExtractor):
-    # https URLs redirect to corresponding http ones
-    _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
-    _TEST = {
-        'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
-        'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
-        'info_dict': {
-            'id': 'n_hayes_Aimm_140801_272214',
-            'ext': 'mp4',
-            'title': 'The chaotic GOP immigration vote',
-            'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'timestamp': 1406937606,
-            'upload_date': '20140802',
-            'uploader': 'NBCU-NEWS',
-            'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        embed_url = self._html_search_meta('embedURL', webpage)
-        return self.url_result(embed_url)
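
With the widened _VALID_URL capturing msnbc.com and the trailing numeric id, NBC News, Today and MSNBC pages all funnel into the same ThePlatform feed, which is why the standalone MSNBCIE above could be dropped. Once the id is known, the resolution is pure string work (feed path as in the hunk above; the id comes from one of the tests):

    def nbcnews_feed_url(mpx_id):
        # numeric ids skip the webpage download entirely
        assert mpx_id.isdigit()
        return 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % mpx_id

    print(nbcnews_feed_url('314487875924'))
    # http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=314487875924
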
index 2a1ca80df797f0abe63cc6327c5e283965865f70..96528f6499d1e02c5208e61fe8abd1f606b29392 100644 (file)
@@ -1,19 +1,18 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
-    month_by_name,
     int_or_none,
+    remove_end,
+    unified_strdate,
 )
 
 
 class NDTVIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710',
+        'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710',
         'md5': '39f992dbe5fb531c395d8bbedb1e5e88',
         'info_dict': {
             'id': '300710',
@@ -22,7 +21,7 @@ class NDTVIE(InfoExtractor):
             'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02',
             'upload_date': '20131208',
             'duration': 1327,
-            'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg',
+            'thumbnail': 're:https?://.*\.jpg',
         },
     }
 
@@ -30,36 +29,19 @@ class NDTVIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
+        title = remove_end(self._og_search_title(webpage), ' - NDTV')
+
         filename = self._search_regex(
             r"__filename='([^']+)'", webpage, 'video filename')
-        video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
-                     filename)
+        video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename
 
         duration = int_or_none(self._search_regex(
             r"__duration='([^']+)'", webpage, 'duration', fatal=False))
 
-        date_m = re.search(r'''(?x)
-            <p\s+class="vod_dateline">\s*
-                Published\s+On:\s*
-                (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
-            ''', webpage)
-        upload_date = None
-
-        if date_m is not None:
-            month = month_by_name(date_m.group('monthname'))
-            if month is not None:
-                upload_date = '%s%02d%02d' % (
-                    date_m.group('year'), month, int(date_m.group('day')))
-
-        description = self._og_search_description(webpage)
-        READ_MORE = ' (Read more)'
-        if description.endswith(READ_MORE):
-            description = description[:-len(READ_MORE)]
+        upload_date = unified_strdate(self._html_search_meta(
+            'publish-date', webpage, 'upload date', fatal=False))
 
-        title = self._og_search_title(webpage)
-        TITLE_SUFFIX = ' - NDTV'
-        if title.endswith(TITLE_SUFFIX):
-            title = title[:-len(TITLE_SUFFIX)]
+        description = remove_end(self._og_search_description(webpage), ' (Read more)')
 
         return {
             'id': video_id,
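
The NDTV rewrite swaps the hand-rolled date regex and suffix trimming for two utils helpers; roughly, they behave like this (sample strings invented):

    from youtube_dl.utils import remove_end, unified_strdate

    print(remove_end('Some Headline - NDTV', ' - NDTV'))  # 'Some Headline'
    print(remove_end('No suffix here', ' - NDTV'))        # unchanged
    print(unified_strdate('2013-12-08'))                  # '20131208'
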
index 51e4a34f789f0e7e9dff2eeb9ec839e655632c75..adcc636bc32c062fec74074044783e452a3725d9 100644 (file)
@@ -2,8 +2,12 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    sanitized_Request,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    qualities,
     urlencode_postdata,
+    xpath_text,
 )
 
 
@@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):
         'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
         'info_dict': {
             'id': 'qallunaat_why_white_people_are_funny',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Qallunaat! Why White People Are Funny ',
-            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
+            'description': 'md5:6b8e32dde3abf91e58857b174916620c',
             'duration': 3128,
+            'creator': 'Mark Sandiford',
             'uploader': 'Mark Sandiford',
-            'uploader_id': 'mark-sandiford',
         },
         'params': {
             # rtmp download
@@ -31,65 +35,78 @@ class NFBIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        page = self._download_webpage(
-            'https://www.nfb.ca/film/%s' % video_id, video_id,
-            'Downloading film page')
 
-        uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
-                                              page, 'director id', fatal=False)
-        uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
-                                           page, 'director name', fatal=False)
-
-        request = sanitized_Request(
+        config = self._download_xml(
             'https://www.nfb.ca/film/%s/player_config' % video_id,
-            urlencode_postdata({'getConfig': 'true'}))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
-
-        config = self._download_xml(request, video_id, 'Downloading player config XML')
+            video_id, 'Downloading player config XML',
+            data=urlencode_postdata({'getConfig': 'true'}),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
+            })
 
-        title = None
-        description = None
-        thumbnail = None
-        duration = None
-        formats = []
-
-        def extract_thumbnail(media):
-            thumbnails = {}
-            for asset in media.findall('assets/asset'):
-                thumbnails[asset.get('quality')] = asset.find('default/url').text
-            if not thumbnails:
-                return None
-            if 'high' in thumbnails:
-                return thumbnails['high']
-            return list(thumbnails.values())[0]
+        title, description, duration, uploader = [None] * 4
+        thumbnails, formats = [], []
+        subtitles = {}
 
         for media in config.findall('./player/stream/media'):
             if media.get('type') == 'posterImage':
-                thumbnail = extract_thumbnail(media)
+                quality_key = qualities(('low', 'high'))
+                thumbnails = []
+                for asset in media.findall('assets/asset'):
+                    asset_url = xpath_text(asset, 'default/url', default=None)
+                    if not asset_url:
+                        continue
+                    quality = asset.get('quality')
+                    thumbnails.append({
+                        'url': asset_url,
+                        'id': quality,
+                        'preference': quality_key(quality),
+                    })
             elif media.get('type') == 'video':
-                duration = int(media.get('duration'))
-                title = media.find('title').text
-                description = media.find('description').text
-                # It seems assets always go from lower to better quality, so no need to sort
+                title = xpath_text(media, 'title', fatal=True)
                 for asset in media.findall('assets/asset'):
-                    for x in asset:
+                    quality = asset.get('quality')
+                    height = int_or_none(self._search_regex(
+                        r'^(\d+)[pP]$', quality or '', 'height', default=None))
+                    for node in asset:
+                        streamer = xpath_text(node, 'streamerURI', default=None)
+                        if not streamer:
+                            continue
+                        play_path = xpath_text(node, 'url', default=None)
+                        if not play_path:
+                            continue
                         formats.append({
-                            'url': x.find('streamerURI').text,
-                            'app': x.find('streamerURI').text.split('/', 3)[3],
-                            'play_path': x.find('url').text,
+                            'url': streamer,
+                            'app': streamer.split('/', 3)[3],
+                            'play_path': play_path,
                             'rtmp_live': False,
-                            'ext': 'mp4',
-                            'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+                            'ext': 'flv',
+                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
+                            'height': height,
                         })
+                self._sort_formats(formats)
+                description = clean_html(xpath_text(media, 'description'))
+                uploader = xpath_text(media, 'author')
+                duration = int_or_none(media.get('duration'))
+                for subtitle in media.findall('./subtitles/subtitle'):
+                    subtitle_url = xpath_text(subtitle, 'url', default=None)
+                    if not subtitle_url:
+                        continue
+                    lang = xpath_text(subtitle, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': subtitle_url,
+                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
+                    })
 
         return {
             'id': video_id,
             'title': title,
             'description': description,
-            'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
             'duration': duration,
+            'creator': uploader,
             'uploader': uploader,
-            'uploader_id': uploader_id,
             'formats': formats,
+            'subtitles': subtitles,
         }
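
The NFB thumbnail rework leans on the qualities helper: given an ordered tuple it returns a scoring function, so 'high' outranks 'low' and unknown labels sink to the bottom. A quick check of that behaviour:

    from youtube_dl.utils import qualities

    quality_key = qualities(('low', 'high'))
    print(quality_key('low'), quality_key('high'), quality_key('medium'))
    # 0 1 -1
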
index ce065f2b086adbca9c551afeb0d2437a59248d88..4935002d0fcf3ec968d73e925a193425dc53dd40 100644 (file)
@@ -3,11 +3,12 @@ from __future__ import unicode_literals
 
 from .mtv import MTVServicesInfoExtractor
 from ..compat import compat_urllib_parse_urlencode
+from ..utils import update_url_query
 
 
 class NickIE(MTVServicesInfoExtractor):
     IE_NAME = 'nick.com'
-    _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)'
+    _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
     _TESTS = [{
         'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
@@ -51,6 +52,9 @@ class NickIE(MTVServicesInfoExtractor):
                 }
             },
         ],
+    }, {
+        'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
+        'only_matching': True,
     }]
 
     def _get_feed_query(self, uri):
@@ -61,3 +65,26 @@ class NickIE(MTVServicesInfoExtractor):
 
     def _extract_mgid(self, webpage):
         return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')
+
+
+class NickDeIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nick.de'
+    _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nick.de/shows/342-icarly',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        mrss_url = update_url_query(self._search_regex(
+            r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'),
+            {'siteKey': 'nick.de'})
+
+        return self._get_videos_info_from_url(mrss_url, video_id)
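
The data-mrss lookup in NickDeIE uses the quote-agnostic idiom seen throughout the codebase: the first group captures whichever quote opens the attribute and \1 requires the same one to close it. Verifying the pattern on invented markup:

    import re

    pattern = r'data-mrss=(["\'])(?P<url>http.+?)\1'
    sample = "<div data-mrss='http://example.invalid/feed.rss'></div>"
    print(re.search(pattern, sample).group('url'))
    # http://example.invalid/feed.rss
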
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
new file mode 100644 (file)
index 0000000..d889245
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+    ExtractorError
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        destination_code, video_id = re.match(self._VALID_URL, url).groups()
+        api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id)
+        content = self._download_json(api_base_url, video_id, query={
+            '$include': '[contentpackages]',
+        })
+        title = content['Name']
+        if len(content['ContentPackages']) > 1:
+            raise ExtractorError('multiple content packages')
+        content_package = content['ContentPackages'][0]
+        stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id']
+        stacks = self._download_json(stacks_base_url, video_id)['Items']
+        if len(stacks) > 1:
+            raise ExtractorError('multiple stacks')
+        stack = stacks[0]
+        stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id'])
+        formats = []
+        formats.extend(self._extract_m3u8_formats(
+            stack_base_url + 'm3u8', video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            stack_base_url + 'f4m', video_id,
+            f4m_id='hds', fatal=False))
+        mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False)
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'mp4',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': content.get('Desc') or content.get('ShortDesc'),
+            'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
+            'duration': parse_duration(content.get('BroadcastTime')),
+            'formats': formats,
+        }
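
All three delivery flavours in the new 9c9media extractor hang off a single manifest prefix; only the extension differs. The URL fan-out, with hypothetical ids standing in for the API responses:

    destination_code, video_id = 'ctv_web', '12345'  # hypothetical values
    api_base_url = ('http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/'
                    % (destination_code, video_id))
    stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % '1'
    stack_base_url = '%s%s/manifest.' % (stacks_base_url, '2')
    for ext, flavour in (('m3u8', 'HLS'), ('f4m', 'HDS'), ('pd', 'progressive MP4')):
        print(flavour, '->', stack_base_url + ext)
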
index 77e09107299824f5ae4063817d73e505e893c2af..af44c3bb5714bc0079e3d1307782a8ff1fe5ba84 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from .screenwavemedia import ScreenwaveMediaIE
 
 from ..utils import (
     unified_strdate,
@@ -12,7 +13,6 @@ class NormalbootsIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$'
     _TEST = {
         'url': 'http://normalboots.com/video/home-alone-games-jontron/',
-        'md5': '8bf6de238915dd501105b44ef5f1e0f6',
         'info_dict': {
             'id': 'home-alone-games-jontron',
             'ext': 'mp4',
@@ -22,9 +22,10 @@ class NormalbootsIE(InfoExtractor):
             'upload_date': '20140125',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
+        'add_ie': ['ScreenwaveMedia'],
     }
 
     def _real_extract(self, url):
@@ -38,16 +39,15 @@ class NormalbootsIE(InfoExtractor):
             r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
             webpage, 'date', fatal=False))
 
-        player_url = self._html_search_regex(
-            r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"',
-            webpage, 'player url')
-        player_page = self._download_webpage(player_url, video_id)
-        video_url = self._html_search_regex(
-            r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+        screenwavemedia_url = self._html_search_regex(
+            ScreenwaveMediaIE.EMBED_PATTERN, webpage, 'screenwave URL',
+            group='url')
 
         return {
+            '_type': 'url_transparent',
             'id': video_id,
-            'url': video_url,
+            'url': screenwavemedia_url,
+            'ie_key': ScreenwaveMediaIE.ie_key(),
             'title': self._og_search_title(webpage),
             'description': self._og_search_description(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
index 9df20082224f84099657d2c2415cb9b2e66df8b6..6ded5bd456fa86bf16e1762601889b46f2d68fe9 100644 (file)
@@ -4,90 +4,218 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urlparse,
-    compat_urllib_parse_unquote,
-)
+from ..compat import compat_urllib_parse_unquote
 from ..utils import (
-    determine_ext,
     ExtractorError,
-    float_or_none,
+    int_or_none,
+    parse_age_limit,
     parse_duration,
-    unified_strdate,
 )
 
 
-class NRKIE(InfoExtractor):
-    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
-
-    _TESTS = [
-        {
-            'url': 'http://www.nrk.no/video/PS*150533',
-            'md5': 'bccd850baebefe23b56d708a113229c2',
-            'info_dict': {
-                'id': '150533',
-                'ext': 'flv',
-                'title': 'Dompap og andre fugler i Piip-Show',
-                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
-                'duration': 263,
-            }
-        },
-        {
-            'url': 'http://www.nrk.no/video/PS*154915',
-            'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
-            'info_dict': {
-                'id': '154915',
-                'ext': 'flv',
-                'title': 'Slik høres internett ut når du er blind',
-                'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
-                'duration': 20,
-            }
-        },
-    ]
+class NRKBaseIE(InfoExtractor):
+    def _extract_formats(self, manifest_url, video_id, fatal=True):
+        formats = []
+        formats.extend(self._extract_f4m_formats(
+            manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81',
+            video_id, f4m_id='hds', fatal=fatal))
+        formats.extend(self._extract_m3u8_formats(manifest_url.replace(
+            'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'),
+            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal))
+        return formats
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         data = self._download_json(
-            'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
-            video_id, 'Downloading media JSON')
+            'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
+            video_id, 'Downloading mediaelement JSON')
+
+        title = data.get('fullTitle') or data.get('mainTitle') or data['title']
+        video_id = data.get('id') or video_id
+
+        entries = []
+
+        media_assets = data.get('mediaAssets')
+        if media_assets and isinstance(media_assets, list):
+            def video_id_and_title(idx):
+                return ((video_id, title) if len(media_assets) == 1
+                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
+            for num, asset in enumerate(media_assets, 1):
+                asset_url = asset.get('url')
+                if not asset_url:
+                    continue
+                formats = self._extract_formats(asset_url, video_id, fatal=False)
+                if not formats:
+                    continue
+                self._sort_formats(formats)
+                entry_id, entry_title = video_id_and_title(num)
+                duration = parse_duration(asset.get('duration'))
+                subtitles = {}
+                for subtitle in ('webVtt', 'timedText'):
+                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
+                    if subtitle_url:
+                        subtitles.setdefault('no', []).append({
+                            'url': compat_urllib_parse_unquote(subtitle_url)
+                        })
+                entries.append({
+                    'id': asset.get('carrierId') or entry_id,
+                    'title': entry_title,
+                    'duration': duration,
+                    'subtitles': subtitles,
+                    'formats': formats,
+                })
 
-        media_url = data.get('mediaUrl')
+        if not entries:
+            media_url = data.get('mediaUrl')
+            if media_url:
+                formats = self._extract_formats(media_url, video_id)
+                self._sort_formats(formats)
+                duration = parse_duration(data.get('duration'))
+                entries = [{
+                    'id': video_id,
+                    'title': title,
+                    'duration': duration,
+                    'formats': formats,
+                }]
 
-        if not media_url:
-            if data['usageRights']['isGeoBlocked']:
+        if not entries:
+            if data.get('usageRights', {}).get('isGeoBlocked'):
                 raise ExtractorError(
                     'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
                     expected=True)
 
-        if determine_ext(media_url) == 'f4m':
-            formats = self._extract_f4m_formats(
-                media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
-            self._sort_formats(formats)
-        else:
-            formats = [{
-                'url': media_url,
-                'ext': 'flv',
-            }]
-
-        duration = parse_duration(data.get('duration'))
+        conviva = data.get('convivaStatistics') or {}
+        series = conviva.get('seriesName') or data.get('seriesTitle')
+        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
 
+        thumbnails = None
         images = data.get('images')
-        if images:
-            thumbnails = images['webImages']
-            thumbnails.sort(key=lambda image: image['pixelWidth'])
-            thumbnail = thumbnails[-1]['imageUrl']
-        else:
-            thumbnail = None
-
-        return {
-            'id': video_id,
-            'title': data['title'],
-            'description': data['description'],
-            'duration': duration,
-            'thumbnail': thumbnail,
-            'formats': formats,
+        if images and isinstance(images, dict):
+            web_images = images.get('webImages')
+            if isinstance(web_images, list):
+                thumbnails = [{
+                    'url': image['imageUrl'],
+                    'width': int_or_none(image.get('width')),
+                    'height': int_or_none(image.get('height')),
+                } for image in web_images if image.get('imageUrl')]
+
+        description = data.get('description')
+
+        common_info = {
+            'description': description,
+            'series': series,
+            'episode': episode,
+            'age_limit': parse_age_limit(data.get('legalAge')),
+            'thumbnails': thumbnails,
+        }
+
+        vcodec = 'none' if data.get('mediaType') == 'Audio' else None
+
+        # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
+
+        for entry in entries:
+            entry.update(common_info)
+            for f in entry['formats']:
+                f['vcodec'] = vcodec
+
+        return self.playlist_result(entries, video_id, title, description)
+
+
+class NRKIE(NRKBaseIE):
+    _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+    _API_HOST = 'v8.psapi.nrk.no'
+    _TESTS = [{
+        # video
+        'url': 'http://www.nrk.no/video/PS*150533',
+        'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+        'info_dict': {
+            'id': '150533',
+            'ext': 'mp4',
+            'title': 'Dompap og andre fugler i Piip-Show',
+            'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+            'duration': 263,
+        }
+    }, {
+        # audio
+        'url': 'http://www.nrk.no/video/PS*154915',
+        # MD5 is unstable
+        'info_dict': {
+            'id': '154915',
+            'ext': 'flv',
+            'title': 'Slik høres internett ut når du er blind',
+            'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+            'duration': 20,
         }
+    }]
+
+
+class NRKTVIE(NRKBaseIE):
+    IE_DESC = 'NRK TV and NRK Radio'
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+    _API_HOST = 'psapi-we.nrk.no'
+
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+        'md5': '4e9ca6629f09e588ed240fb11619922a',
+        'info_dict': {
+            'id': 'MUHH48000314AA',
+            'ext': 'mp4',
+            'title': '20 spørsmål 23.05.2014',
+            'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+            'duration': 1741,
+        },
+    }, {
+        'url': 'https://tv.nrk.no/program/mdfp15000514',
+        'md5': '43d0be26663d380603a9cf0c24366531',
+        'info_dict': {
+            'id': 'MDFP15000514CA',
+            'ext': 'mp4',
+            'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
+            'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
+            'duration': 4605,
+        },
+    }, {
+        # single playlist video
+        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+        'md5': 'adbd1dbd813edaf532b0a253780719c2',
+        'info_dict': {
+            'id': 'MSPO40010515-part2',
+            'ext': 'flv',
+            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+        },
+        'skip': 'Only works from Norway',
+    }, {
+        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+        'playlist': [{
+            'md5': '9480285eff92d64f06e02a5367970a7a',
+            'info_dict': {
+                'id': 'MSPO40010515-part1',
+                'ext': 'flv',
+                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+            },
+        }, {
+            'md5': 'adbd1dbd813edaf532b0a253780719c2',
+            'info_dict': {
+                'id': 'MSPO40010515-part2',
+                'ext': 'flv',
+                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+            },
+        }],
+        'info_dict': {
+            'id': 'MSPO40010515',
+            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+            'duration': 6947.52,
+        },
+        'skip': 'Only works from Norway',
+    }, {
+        'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+        'only_matching': True,
+    }]
 
 
 class NRKPlaylistIE(InfoExtractor):
@@ -132,206 +260,34 @@ class NRKPlaylistIE(InfoExtractor):
 
 class NRKSkoleIE(InfoExtractor):
     IE_DESC = 'NRK Skole'
-    _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
 
     _TESTS = [{
-        'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532',
-        'md5': '04cd85877cc1913bce73c5d28a47e00f',
+        'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099',
+        'md5': '6bc936b01f9dd8ed45bc58b252b2d9b6',
         'info_dict': {
             'id': '6021',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Genetikk og eneggede tvillinger',
             'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
             'duration': 399,
         },
     }, {
-        'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379',
+        'url': 'https://www.nrk.no/skole/?page=objectives&subject=naturfag&objective=K15114&mediaId=19355',
         'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = compat_urllib_parse_unquote(self._match_id(url))
-
-        webpage = self._download_webpage(url, video_id)
-
-        nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
-        return self.url_result('nrk:%s' % nrk_id)
-
-
-class NRKTVIE(InfoExtractor):
-    IE_DESC = 'NRK TV and NRK Radio'
-    _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
-
-    _TESTS = [
-        {
-            'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
-            'info_dict': {
-                'id': 'MUHH48000314',
-                'ext': 'mp4',
-                'title': '20 spørsmål',
-                'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
-                'upload_date': '20140523',
-                'duration': 1741.52,
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        },
-        {
-            'url': 'https://tv.nrk.no/program/mdfp15000514',
-            'info_dict': {
-                'id': 'mdfp15000514',
-                'ext': 'mp4',
-                'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
-                'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
-                'upload_date': '20140524',
-                'duration': 4605.08,
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        },
-        {
-            # single playlist video
-            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
-            'md5': 'adbd1dbd813edaf532b0a253780719c2',
-            'info_dict': {
-                'id': 'MSPO40010515-part2',
-                'ext': 'flv',
-                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
-                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
-                'upload_date': '20150106',
-            },
-            'skip': 'Only works from Norway',
-        },
-        {
-            'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
-            'playlist': [
-                {
-                    'md5': '9480285eff92d64f06e02a5367970a7a',
-                    'info_dict': {
-                        'id': 'MSPO40010515-part1',
-                        'ext': 'flv',
-                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
-                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
-                        'upload_date': '20150106',
-                    },
-                },
-                {
-                    'md5': 'adbd1dbd813edaf532b0a253780719c2',
-                    'info_dict': {
-                        'id': 'MSPO40010515-part2',
-                        'ext': 'flv',
-                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
-                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
-                        'upload_date': '20150106',
-                    },
-                },
-            ],
-            'info_dict': {
-                'id': 'MSPO40010515',
-                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
-                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
-                'upload_date': '20150106',
-                'duration': 6947.5199999999995,
-            },
-            'skip': 'Only works from Norway',
-        },
-        {
-            'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
-            'only_matching': True,
-        }
-    ]
+        video_id = self._match_id(url)
 
-    def _extract_f4m(self, manifest_url, video_id):
-        return self._extract_f4m_formats(
-            manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
+        webpage = self._download_webpage(
+            'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id,
+            video_id)
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        part_id = mobj.group('part_id')
-        base_url = mobj.group('baseurl')
-
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_meta(
-            'title', webpage, 'title')
-        description = self._html_search_meta(
-            'description', webpage, 'description')
-
-        thumbnail = self._html_search_regex(
-            r'data-posterimage="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
-        upload_date = unified_strdate(self._html_search_meta(
-            'rightsfrom', webpage, 'upload date', fatal=False))
-        duration = float_or_none(self._html_search_regex(
-            r'data-duration="([^"]+)"',
-            webpage, 'duration', fatal=False))
-
-        # playlist
-        parts = re.findall(
-            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
-        if parts:
-            entries = []
-            for current_part_id, stream_url, part_title in parts:
-                if part_id and current_part_id != part_id:
-                    continue
-                video_part_id = '%s-part%s' % (video_id, current_part_id)
-                formats = self._extract_f4m(stream_url, video_part_id)
-                entries.append({
-                    'id': video_part_id,
-                    'title': part_title,
-                    'description': description,
-                    'thumbnail': thumbnail,
-                    'upload_date': upload_date,
-                    'formats': formats,
-                })
-            if part_id:
-                if entries:
-                    return entries[0]
-            else:
-                playlist = self.playlist_result(entries, video_id, title, description)
-                playlist.update({
-                    'thumbnail': thumbnail,
-                    'upload_date': upload_date,
-                    'duration': duration,
-                })
-                return playlist
-
-        formats = []
+        nrk_id = self._parse_json(
+            self._search_regex(
+                r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>',
+                webpage, 'application json'),
+            video_id)['activeMedia']['psId']
 
-        f4m_url = re.search(r'data-media="([^"]+)"', webpage)
-        if f4m_url:
-            formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
-
-        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
-        self._sort_formats(formats)
-
-        subtitles_url = self._html_search_regex(
-            r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
-            webpage, 'subtitle URL', default=None, group='url')
-        subtitles = {}
-        if subtitles_url:
-            subtitles['no'] = [{
-                'ext': 'ttml',
-                'url': compat_urlparse.urljoin(base_url, subtitles_url),
-            }]
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'duration': duration,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        return self.url_result('nrk:%s' % nrk_id)
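
NRKBaseIE._extract_formats above derives the HLS master playlist from the Akamai HDS manifest URL by swapping the path flavour and the file name; the API hands out only the one manifest. The rewrite on its own (example URL hypothetical):

    manifest_url = 'http://nordond.akamaihd.net/z/no/clip/0123/manifest.f4m'  # hypothetical
    m3u8_url = manifest_url.replace(
        'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8')
    print(m3u8_url)
    # http://nordond.akamaihd.net/i/no/clip/0123/master.m3u8
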
index 9fa7cefadc79ef1d8bda971dc52483a0b8d998eb..ab6bfcd7f485218d19034930607d35d1665e7406 100644 (file)
@@ -5,8 +5,6 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     parse_duration,
-    sanitized_Request,
-    unified_strdate,
 )
 
 
@@ -20,7 +18,6 @@ class NuvidIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Horny babes show their awesome bodeis and',
             'duration': 129,
-            'upload_date': '20140508',
             'age_limit': 18,
         }
     }
@@ -28,28 +25,31 @@ class NuvidIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        formats = []
+        page_url = 'http://m.nuvid.com/video/%s' % video_id
+        webpage = self._download_webpage(
+            page_url, video_id, 'Downloading video page')
+        # When dwnld_speed exists and has a value larger than the MP4 file's
+        # bitrate, Nuvid returns the MP4 URL. Its unit is 100 bytes/millisecond;
+        # see mobile-nuvid-min.js for the algorithm.
+        self._set_cookie('nuvid.com', 'dwnld_speed', '10.0')
+        mp4_webpage = self._download_webpage(
+            page_url, video_id, 'Downloading video page for MP4 format')
 
-        for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]:
-            request = sanitized_Request(
-                'http://m.nuvid.com/play/%s' % video_id)
-            request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed)
-            webpage = self._download_webpage(
-                request, video_id, 'Downloading %s page' % format_id)
-            video_url = self._html_search_regex(
-                r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False)
-            if not video_url:
-                continue
+        html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']'
+        video_url = self._html_search_regex(html5_video_re, webpage, 'video URL')
+        mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, 'MP4 video URL')
+        formats = [{
+            'url': video_url,
+        }]
+        if mp4_video_url != video_url:
             formats.append({
-                'url': video_url,
-                'format_id': format_id,
+                'url': mp4_video_url,
             })
 
-        webpage = self._download_webpage(
-            'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page')
         title = self._html_search_regex(
             [r'<span title="([^"]+)">',
-             r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip()
+             r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>',
+             r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip()
         thumbnails = [
             {
                 'url': thumb_url,
@@ -57,9 +57,8 @@ class NuvidIE(InfoExtractor):
         ]
         thumbnail = thumbnails[0]['url'] if thumbnails else None
         duration = parse_duration(self._html_search_regex(
-            r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False))
-        upload_date = unified_strdate(self._html_search_regex(
-            r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False))
+            [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})',
+             r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False))
 
         return {
             'id': video_id,
@@ -67,7 +66,6 @@ class NuvidIE(InfoExtractor):
             'thumbnails': thumbnails,
             'thumbnail': thumbnail,
             'duration': duration,
-            'upload_date': upload_date,
             'age_limit': 18,
             'formats': formats,
         }
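
The Nuvid change boils down to fetching the same mobile page twice, the second time after planting a dwnld_speed cookie large enough that the server swaps in the MP4 source. Stripped of the extractor machinery, the idea looks roughly like this (plain-urllib sketch, assuming the site still honours the cookie; the clip id is the one from the test):

    try:
        from urllib.request import Request, urlopen  # Python 3
    except ImportError:
        from urllib2 import Request, urlopen  # Python 2

    page_url = 'http://m.nuvid.com/video/1509445'

    default_html = urlopen(Request(page_url)).read()

    # claim a very fast connection (unit: 100 bytes/ms) to get the MP4 source
    mp4_html = urlopen(Request(page_url, headers={'Cookie': 'dwnld_speed=10.0'})).read()
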
index f9e064a60e445668200b759ca4e0ad1a6f7c28ab..986708e75e45f7f24f656f319767e6adbd9504ea 100644 (file)
@@ -2,7 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
     ExtractorError,
     unified_strdate,
@@ -32,7 +36,7 @@ class OdnoklassnikiIE(InfoExtractor):
         'skip': 'Video has been blocked',
     }, {
         # metadataUrl
-        'url': 'http://ok.ru/video/63567059965189-0',
+        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
         'md5': '9676cf86eff5391d35dea675d224e131',
         'info_dict': {
             'id': '63567059965189-0',
@@ -44,6 +48,7 @@ class OdnoklassnikiIE(InfoExtractor):
             'uploader': '☭ Андрей Мещанинов ☭',
             'like_count': int,
             'age_limit': 0,
+            'start_time': 5,
         },
     }, {
         # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
@@ -60,6 +65,22 @@ class OdnoklassnikiIE(InfoExtractor):
             'uploader': 'Алина П',
             'age_limit': 0,
         },
+    }, {
+        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
+        'url': 'http://ok.ru/video/62036049272859-0',
+        'info_dict': {
+            'id': '62036049272859-0',
+            'ext': 'mp4',
+            'title': 'МУЗЫКА     ДОЖДЯ .',
+            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
+            'upload_date': '20120106',
+            'uploader_id': '473534735899',
+            'uploader': 'МARINA D',
+            'age_limit': 0,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
         'only_matching': True,
@@ -78,6 +99,9 @@ class OdnoklassnikiIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
+        start_time = int_or_none(compat_parse_qs(
+            compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
+
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(
@@ -106,7 +130,14 @@ class OdnoklassnikiIE(InfoExtractor):
                 video_id, 'Downloading metadata JSON')
 
         movie = metadata['movie']
-        title = movie['title']
+
+        # Some embedded videos may not contain title in the movie dict (e.g.
+        # http://ok.ru/video/62036049272859-0), so a missing title is allowed
+        # here; it will be extracted later by the extractor that processes
+        # the actual embed.
+        provider = metadata.get('provider')
+        title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')
+
         thumbnail = movie.get('poster')
         duration = int_or_none(movie.get('duration'))
 
@@ -135,9 +166,10 @@ class OdnoklassnikiIE(InfoExtractor):
             'uploader_id': uploader_id,
             'like_count': like_count,
             'age_limit': age_limit,
+            'start_time': start_time,
         }
 
-        if metadata.get('provider') == 'USER_YOUTUBE':
+        if provider == 'USER_YOUTUBE':
             info.update({
                 '_type': 'url_transparent',
                 'url': movie['contentId'],
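
In the Odnoklassniki change, start_time is read straight off the fromTime query parameter before anything is downloaded. The parsing step in isolation, using the same compat/utils helpers the file imports:

    from youtube_dl.compat import compat_parse_qs, compat_urllib_parse_urlparse
    from youtube_dl.utils import int_or_none

    url = 'http://ok.ru/video/63567059965189-0?fromTime=5'
    start_time = int_or_none(compat_parse_qs(
        compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
    print(start_time)  # 5
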
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
new file mode 100644 (file)
index 0000000..402d3a9
--- /dev/null
@@ -0,0 +1,172 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+    def _search_mvp_id(self, webpage):
+        return self._search_regex(
+            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+    def _extract_from_id(self, video_id, webpage):
+        response = self._download_json(
+            'http://qi.ckm.onetapi.pl/', video_id,
+            query={
+                'body[id]': video_id,
+                'body[jsonrpc]': '2.0',
+                'body[method]': 'get_asset_detail',
+                'body[params][ID_Publikacji]': video_id,
+                'body[params][Service]': 'www.onet.pl',
+                'content-type': 'application/jsonp',
+                'x-onet-app': 'player.front.onetapi.pl',
+            })
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+        video = response['result'].get('0')
+
+        formats = []
+        for _, formats_dict in video['formats'].items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_list in formats_dict.items():
+                if not isinstance(format_list, list):
+                    continue
+                for f in format_list:
+                    video_url = f.get('url')
+                    if not video_url:
+                        continue
+                    ext = determine_ext(video_url)
+                    if format_id == 'ism':
+                        # TODO: Support Microsoft Smooth Streaming
+                        continue
+                    elif ext == 'mpd':
+                        # TODO: Current DASH formats are broken - $Time$ pattern in
+                        # <SegmentTemplate> not implemented yet
+                        # formats.extend(self._extract_mpd_formats(
+                        #    video_url, video_id, mpd_id='dash', fatal=False))
+                        continue
+                    else:
+                        formats.append({
+                            'url': video_url,
+                            'format_id': format_id,
+                            'height': int_or_none(f.get('vertical_resolution')),
+                            'width': int_or_none(f.get('horizontal_resolution')),
+                            'abr': float_or_none(f.get('audio_bitrate')),
+                            'vbr': float_or_none(f.get('video_bitrate')),
+                        })
+        self._sort_formats(formats)
+
+        meta = video.get('meta', {})
+
+        title = self._og_search_title(webpage, default=None) or meta['title']
+        description = self._og_search_description(webpage, default=None) or meta.get('description')
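+        # 'lenght' is intentional: the API apparently returns the misspelled key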
+        duration = meta.get('length') or meta.get('lenght')
+        timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class OnetIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.tv'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'md5': 'e3ffbf47590032ac3f27249204173d50',
+        'info_dict': {
+            'id': 'qbpyqc',
+            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+            'ext': 'mp4',
+            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+            'upload_date': '20160705',
+            'timestamp': 1467721580,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+
+        return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+    IE_NAME = 'onet.tv:channel'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival',
+        'info_dict': {
+            'id': 'openerfestival',
+            'title': 'Open\'er Festival Live',
+            'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+        },
+        'playlist_mincount': 46,
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
+        current_clip_info = self._parse_json(self._search_regex(
+            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
+        video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+        video_name = url_basename(current_clip_info['url'])
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_name)
+            return self._extract_from_id(video_id, webpage)
+
+        self.to_screen(
+            'Downloading channel %s - add --no-playlist to just download video %s' % (
+                channel_id, video_name))
+        matches = re.findall(
+            r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+            webpage)
+        entries = [
+            self.url_result(video_link, OnetIE.ie_key())
+            for video_link in matches]
+
+        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+        return self.playlist_result(entries, channel_id, channel_title, channel_description)
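
The qi.ckm.onetapi.pl call in OnetBaseIE._extract_from_id is an ordinary GET with
JSON-RPC-style parameters flattened into the query string. A rough standalone
equivalent (using requests for brevity; the error shape is taken from the
extractor above):

    import requests

    def get_asset_detail(video_id):
        response = requests.get('http://qi.ckm.onetapi.pl/', params={
            'body[id]': video_id,
            'body[jsonrpc]': '2.0',
            'body[method]': 'get_asset_detail',
            'body[params][ID_Publikacji]': video_id,
            'body[params][Service]': 'www.onet.pl',
            'content-type': 'application/jsonp',
            'x-onet-app': 'player.front.onetapi.pl',
        }).json()
        error = response.get('error')
        if error:
            raise RuntimeError('onet.tv said: %s' % error['message'])
        return response['result'].get('0')
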
index 6e843c327603f19560e2bb35a2d5d462fe4516c3..6fb1a3fcc0bd565677b232adcb883b3649715dde 100644 (file)
@@ -7,6 +7,8 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     int_or_none,
+    float_or_none,
+    mimetype2ext,
 )
 
 
@@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
-        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'md5': 'e49f947c105b8a78a675a0ee1bddedfe',
         'info_dict': {
             'id': '2937',
             'ext': 'mp4',
             'title': 'Hannibal charges forward, stops for a cocktail',
-            'description': 'md5:e786add7f280b7f0fe237b64cc73df76',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': 'The A.V. Club',
-            'uploader_id': 'TheAVClub',
+            'uploader_id': 'the-av-club',
         },
     }, {
         'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
@@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+        video_data = self._download_json(
+            'http://www.onionstudios.com/video/%s.json' % video_id, video_id)
+
+        title = video_data['title']
 
         formats = []
-        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
-            ext = determine_ext(src)
+        for source in video_data.get('sources', []):
+            source_url = source.get('url')
+            if not source_url:
+                continue
+            ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
             else:
-                height = int_or_none(self._search_regex(
-                    r'/(\d+)\.%s' % ext, src, 'height', default=None))
+                tbr = int_or_none(source.get('bitrate'))
                 formats.append({
-                    'format_id': ext + ('-%sp' % height if height else ''),
-                    'url': src,
-                    'height': height,
+                    'format_id': ext + ('-%d' % tbr if tbr else ''),
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'tbr': tbr,
                     'ext': ext,
-                    'preference': 1,
                 })
         self._sort_formats(formats)
 
-        title = self._search_regex(
-            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
-            webpage, 'title', group='title')
-        description = self._search_regex(
-            r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
-            webpage, 'description', default=None, group='description')
-        thumbnail = self._search_regex(
-            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
-            webpage, 'thumbnail', default=False, group='thumbnail')
-
-        uploader_id = self._search_regex(
-            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
-            webpage, 'uploader id', fatal=False, group='uploader_id')
-        uploader = self._search_regex(
-            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
-            webpage, 'uploader', default=False, group='uploader')
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
+            'thumbnail': video_data.get('poster_url'),
+            'uploader': video_data.get('channel_name'),
+            'uploader_id': video_data.get('channel_slug'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'tags': video_data.get('tags'),
             'formats': formats,
         }
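
The ext selection above falls back from the declared MIME type to the URL suffix.
A simplified stand-in for the mimetype2ext/determine_ext pair from
youtube_dl.utils, covering only the mappings relevant here:

    import posixpath

    MIME_TO_EXT = {'application/x-mpegurl': 'm3u8', 'video/mp4': 'mp4'}

    def pick_ext(content_type, url):
        # Prefer the MIME type; otherwise guess from the URL path suffix
        ext = MIME_TO_EXT.get((content_type or '').lower())
        if not ext:
            ext = posixpath.splitext(url.partition('?')[0])[1].lstrip('.') or None
        return ext

    assert pick_ext('application/x-mpegURL', 'http://example.com/v') == 'm3u8'
    assert pick_ext(None, 'http://example.com/v/720.mp4?token=1') == 'mp4'
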
index 16f040191aa31bd9e8dd49b37a42085c2b340582..2038a6ba5001283e786905a23c429d2418762515 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     float_or_none,
     ExtractorError,
     unsmuggle_url,
+    determine_ext,
 )
 from ..compat import compat_urllib_parse_urlencode
 
@@ -15,71 +16,80 @@ from ..compat import compat_urllib_parse_urlencode
 class OoyalaBaseIE(InfoExtractor):
     _PLAYER_BASE = 'http://player.ooyala.com/'
     _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
-    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?'
+    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'
 
     def _extract(self, content_tree_url, video_id, domain='example.org'):
         content_tree = self._download_json(content_tree_url, video_id)['content_tree']
         metadata = content_tree[list(content_tree)[0]]
         embed_code = metadata['embed_code']
         pcode = metadata.get('asset_pcode') or embed_code
-        video_info = {
-            'id': embed_code,
-            'title': metadata['title'],
-            'description': metadata.get('description'),
-            'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
-            'duration': float_or_none(metadata.get('duration'), 1000),
-        }
+        title = metadata['title']
+
+        auth_data = self._download_json(
+            self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
+            compat_urllib_parse_urlencode({
+                'domain': domain,
+                'supportedFormats': 'mp4,rtmp,m3u8,hds',
+            }), video_id)
+
+        cur_auth_data = auth_data['authorization_data'][embed_code]
 
         urls = []
         formats = []
-        for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'):
-            auth_data = self._download_json(
-                self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
-                compat_urllib_parse_urlencode({
-                    'domain': domain,
-                    'supportedFormats': supported_format
-                }),
-                video_id, 'Downloading %s JSON' % supported_format)
-
-            cur_auth_data = auth_data['authorization_data'][embed_code]
-
-            if cur_auth_data['authorized']:
-                for stream in cur_auth_data['streams']:
-                    url = base64.b64decode(
-                        stream['url']['data'].encode('ascii')).decode('utf-8')
-                    if url in urls:
-                        continue
-                    urls.append(url)
-                    delivery_type = stream['delivery_type']
-                    if delivery_type == 'hls' or '.m3u8' in url:
-                        formats.extend(self._extract_m3u8_formats(
-                            url, embed_code, 'mp4', 'm3u8_native',
-                            m3u8_id='hls', fatal=False))
-                    elif delivery_type == 'hds' or '.f4m' in url:
-                        formats.extend(self._extract_f4m_formats(
-                            url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
-                    elif '.smil' in url:
-                        formats.extend(self._extract_smil_formats(
-                            url, embed_code, fatal=False))
-                    else:
-                        formats.append({
-                            'url': url,
-                            'ext': stream.get('delivery_type'),
-                            'vcodec': stream.get('video_codec'),
-                            'format_id': delivery_type,
-                            'width': int_or_none(stream.get('width')),
-                            'height': int_or_none(stream.get('height')),
-                            'abr': int_or_none(stream.get('audio_bitrate')),
-                            'vbr': int_or_none(stream.get('video_bitrate')),
-                            'fps': float_or_none(stream.get('framerate')),
-                        })
-            else:
-                raise ExtractorError('%s said: %s' % (
-                    self.IE_NAME, cur_auth_data['message']), expected=True)
+        if cur_auth_data['authorized']:
+            for stream in cur_auth_data['streams']:
+                s_url = base64.b64decode(
+                    stream['url']['data'].encode('ascii')).decode('utf-8')
+                if s_url in urls:
+                    continue
+                urls.append(s_url)
+                ext = determine_ext(s_url, None)
+                delivery_type = stream['delivery_type']
+                if delivery_type == 'hls' or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        s_url, embed_code, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif delivery_type == 'hds' or ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+                elif ext == 'smil':
+                    formats.extend(self._extract_smil_formats(
+                        s_url, embed_code, fatal=False))
+                else:
+                    formats.append({
+                        'url': s_url,
+                        'ext': ext or stream.get('delivery_type'),
+                        'vcodec': stream.get('video_codec'),
+                        'format_id': delivery_type,
+                        'width': int_or_none(stream.get('width')),
+                        'height': int_or_none(stream.get('height')),
+                        'abr': int_or_none(stream.get('audio_bitrate')),
+                        'vbr': int_or_none(stream.get('video_bitrate')),
+                        'fps': float_or_none(stream.get('framerate')),
+                    })
+        else:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, cur_auth_data['message']), expected=True)
         self._sort_formats(formats)
 
-        video_info['formats'] = formats
-        return video_info
+        subtitles = {}
+        for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
+            sub_url = sub.get('url')
+            if not sub_url:
+                continue
+            subtitles[lang] = [{
+                'url': sub_url,
+            }]
+
+        return {
+            'id': embed_code,
+            'title': title,
+            'description': metadata.get('description'),
+            'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
+            'duration': float_or_none(metadata.get('duration'), 1000),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
 
 
 class OoyalaIE(OoyalaBaseIE):
@@ -96,6 +106,8 @@ class OoyalaIE(OoyalaBaseIE):
                 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
                 'duration': 853.386,
             },
+            # The video in the original webpage now uses PlayWire
+            'skip': 'Ooyala said: movie expired',
         }, {
             # Only available for ipad
             'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
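
The v2 authorization response above carries stream URLs base64-encoded in
stream['url']['data']; decoding them is the only transformation needed before
dispatching on delivery_type. In isolation (the sample payload is synthetic):

    import base64

    def decode_stream_url(stream):
        # stream['url']['data'] holds the base64-encoded delivery URL
        return base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8')

    sample = {'url': {'data': base64.b64encode(b'http://example.com/master.m3u8').decode('ascii')}}
    assert decode_stream_url(sample) == 'http://example.com/master.m3u8'
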
index 4468f31fcae074090346d134180fee98752b7822..6415b8fdcb7451e6b499d1d4d3515af2775c66c2 100644 (file)
@@ -6,13 +6,15 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_chr
 from ..utils import (
+    determine_ext,
     encode_base_n,
     ExtractorError,
+    mimetype2ext,
 )
 
 
 class OpenloadIE(InfoExtractor):
-    _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-]+)'
+    _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'
 
     _TESTS = [{
         'url': 'https://openload.co/f/kUEfGclsU9o',
@@ -29,6 +31,14 @@ class OpenloadIE(InfoExtractor):
     }, {
         'url': 'https://openload.io/f/ZAn6oz-VZGE/',
         'only_matching': True,
+    }, {
+        'url': 'https://openload.co/f/_-ztPaZtMhM/',
+        'only_matching': True,
+    }, {
+        # unavailable via https://openload.co/f/Sxz5sADo82g/; the embed page
+        # uses a different layout for title and ext
+        'url': 'https://openload.co/embed/Sxz5sADo82g/',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -93,15 +103,28 @@ class OpenloadIE(InfoExtractor):
             raise ExtractorError('File not found', expected=True)
 
         code = self._search_regex(
-            r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>',
+            r'</video>\s*</div>\s*<script[^>]+>[^>]+</script>\s*<script[^>]+>([^<]+)</script>',
             webpage, 'JS code')
 
+        decoded = self.openload_decode(code)
+
         video_url = self._search_regex(
-            r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL')
+            r'return\s+"(https?://[^"]+)"', decoded, 'video URL')
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
+            'title', default=None) or self._html_search_meta(
+            'description', webpage, 'title', fatal=True)
+
+        ext = mimetype2ext(self._search_regex(
+            r'window\.vt\s*=\s*(["\'])(?P<mimetype>.+?)\1', decoded,
+            'mimetype', default=None, group='mimetype')) or determine_ext(
+            video_url, 'mp4')
 
         return {
             'id': video_id,
-            'title': self._og_search_title(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'title': title,
+            'ext': ext,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'url': video_url,
         }
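
The new title extraction is a three-step fallback: og:title, then the title span
used by the embed layout, then the description meta tag. A bare-regex sketch of
the same chain (youtube-dl's helpers add HTML unescaping and attribute-order
tolerance that is omitted here):

    import re

    def extract_title(webpage):
        for pattern in (
                r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\']([^"\']+)',
                r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)',
                r'<meta[^>]+name=["\']description["\'][^>]+content=["\']([^"\']+)'):
            mobj = re.search(pattern, webpage)
            if mobj:
                return mobj.group(1)
        raise ValueError('Unable to extract title')
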
index 8545fb1b88cbf29ae1acb999566ee9041335dc4a..1d42be39b3303c95952a8ec54a34abbb9d09f0b1 100644 (file)
@@ -12,8 +12,8 @@ from ..utils import (
 
 
 class OraTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
+    _TESTS = [{
         'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
         'md5': 'fa33717591c631ec93b04b0e330df786',
         'info_dict': {
@@ -22,7 +22,10 @@ class OraTVIE(InfoExtractor):
             'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
             'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
         }
-    }
+    }, {
+        'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
index 66c75f8b3559752127c091d437e4764b7c722e9d..4e3864f0d7bd1f6b63c560e8b04ae38844a283f9 100644 (file)
@@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor):
             'timestamp': 1452456073,
             'upload_date': '20160110',
         },
+        'skip': 'Live streams on FM4 are deleted soon after airing',
     }
 
     def _real_extract(self, url):
index f43e3a146e7bd35d9a99ab730289f4a1d4f5b91c..f6f423597fe4952427f226fe276e17d2539eaddc 100644 (file)
@@ -196,7 +196,7 @@ class PBSIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
-            'md5': 'ce1888486f0908d555a8093cac9a7362',
+            'md5': '173dc391afd361fa72eab5d3d918968d',
             'info_dict': {
                 'id': '2365006249',
                 'ext': 'mp4',
@@ -204,13 +204,10 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
                 'duration': 3190,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
-            'md5': '143c98aa54a346738a3d78f54c925321',
+            'md5': '6f722cb3c3982186d34b0f13374499c7',
             'info_dict': {
                 'id': '2365297690',
                 'ext': 'mp4',
@@ -218,9 +215,6 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
                 'duration': 5050,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            }
         },
         {
             'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -244,9 +238,6 @@ class PBSIE(InfoExtractor):
                 'duration': 6559,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -262,9 +253,6 @@ class PBSIE(InfoExtractor):
                 'upload_date': '20140122',
                 'age_limit': 10,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -290,6 +278,7 @@ class PBSIE(InfoExtractor):
         },
         {
             'url': 'http://www.pbs.org/video/2365245528/',
+            'md5': '115223d41bd55cda8ae5cd5ed4e11497',
             'info_dict': {
                 'id': '2365245528',
                 'display_id': '2365245528',
@@ -299,15 +288,13 @@ class PBSIE(InfoExtractor):
                 'duration': 6851,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             # Video embedded in an iframe containing angle brackets as an attribute value (e.g.
             # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
             # https://github.com/rg3/youtube-dl/issues/7059)
             'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+            'md5': '84ced42850d78f1d4650297356e95e6f',
             'info_dict': {
                 'id': '2365546844',
                 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
@@ -317,9 +304,6 @@ class PBSIE(InfoExtractor):
                 'duration': 1480,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             # Frontline video embedded via flp2012.js
@@ -340,6 +324,7 @@ class PBSIE(InfoExtractor):
         {
             # Serves hd only via widget/partnerplayer page
             'url': 'http://www.pbs.org/video/2365641075/',
+            'md5': 'acfd4c400b48149a44861cb16dd305cf',
             'info_dict': {
                 'id': '2365641075',
                 'ext': 'mp4',
@@ -348,9 +333,6 @@ class PBSIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'formats': 'mincount:8',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
@@ -494,6 +476,7 @@ class PBSIE(InfoExtractor):
                         info = video_info
 
         formats = []
+        http_url = None
         for num, redirect in enumerate(redirects):
             redirect_id = redirect.get('eeid')
 
@@ -514,13 +497,37 @@ class PBSIE(InfoExtractor):
 
             if determine_ext(format_url) == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
+                    format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 formats.append({
                     'url': format_url,
                     'format_id': redirect_id,
                 })
+                if re.search(r'^https?://.*(?:\d+k|baseline)', format_url):
+                    http_url = format_url
         self._remove_duplicate_formats(formats)
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                # Extract only the formats that we know will be available as HTTP formats.
+                # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+                if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
+                    continue
+                f_url = re.sub(r'\d+k|baseline', bitrate, http_url)
+                # This may sometimes produce invalid links (e.g.
+                # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+                if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate):
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': f_url,
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
         self._sort_formats(formats)
 
         rating_str = info.get('rating')
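
The HTTP format derivation above exploits PBS's URL scheme: per the COVE
specification linked in the hunk, the same encode is published at a fixed set of
bitrates, so a known direct URL can be rewritten with the bitrate parsed from
each HLS variant. The substitution in isolation (example URLs are illustrative,
not real PBS paths):

    import re

    KNOWN_BITRATES = ('400k', '800k', '1200k', '2500k')

    def derive_http_url(http_url, m3u8_url):
        # Swap the bitrate token in a known direct URL for the variant's bitrate
        mobj = re.search(r'(\d+k)', m3u8_url)
        if not mobj or mobj.group(1) not in KNOWN_BITRATES:
            return None
        return re.sub(r'\d+k|baseline', mobj.group(1), http_url)

    assert derive_http_url(
        'http://example.com/video-800k.mp4',
        'http://example.com/hls/video-1200k.m3u8') == 'http://example.com/video-1200k.mp4'
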
@@ -535,6 +542,19 @@ class PBSIE(InfoExtractor):
                 'ext': 'ttml',
                 'url': closed_captions_url,
             }]
+            mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
+            if mobj:
+                ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
+                ttml_caption_id = int(ttml_caption_id)
+                subtitles['en'].extend([{
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
+                    'ext': 'srt',
+                }, {
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
+                    'ext': 'vtt',
+                }])
 
         # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
         # Try turning it to 'program - title' naming scheme if possible
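
The caption handling relies on PBS publishing the same captions as DFXP, SRT and
VTT under consecutive numeric IDs, so the SRT and VTT URLs can be derived from
the DFXP one (sample URL is illustrative):

    import re

    def derive_caption_urls(dfxp_url):
        # /123_Encoded.dfxp -> /124_Encoded.srt and /125_Encoded.vtt
        mobj = re.search(r'/(\d+)_Encoded\.dfxp', dfxp_url)
        if not mobj:
            return []
        suffix, caption_id = mobj.group(0), int(mobj.group(1))
        return [
            dfxp_url.replace(suffix, '/%d_Encoded.srt' % (caption_id + 1)),
            dfxp_url.replace(suffix, '/%d_Encoded.vtt' % (caption_id + 2)),
        ]

    assert derive_caption_urls('http://example.com/123_Encoded.dfxp') == [
        'http://example.com/124_Encoded.srt', 'http://example.com/125_Encoded.vtt']
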
index 514e9b4339be43b509f9c9a8a6d2b87187e5f056..75f5884a928cff177bdabfb3db430ed282d0a7a8 100644 (file)
@@ -2,11 +2,15 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+    parse_iso8601,
+    unescapeHTML,
+)
 
 
 class PeriscopeIE(InfoExtractor):
     IE_DESC = 'Periscope'
+    IE_NAME = 'periscope'
     _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
     # Live example URLs can be found at http://onperiscope.com/
     _TESTS = [{
@@ -41,8 +45,11 @@ class PeriscopeIE(InfoExtractor):
         broadcast = broadcast_data['broadcast']
         status = broadcast['status']
 
-        uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
-        uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+        user = broadcast_data.get('user', {})
+
+        uploader = broadcast.get('user_display_name') or user.get('display_name')
+        uploader_id = (broadcast.get('username') or user.get('username') or
+                       broadcast.get('user_id') or user.get('id'))
 
         title = '%s - %s' % (uploader, status) if uploader else status
         state = broadcast.get('state').lower()
@@ -79,3 +86,46 @@ class PeriscopeIE(InfoExtractor):
             'thumbnails': thumbnails,
             'formats': formats,
         }
+
+
+class PeriscopeUserIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$'
+    IE_DESC = 'Periscope user videos'
+    IE_NAME = 'periscope:user'
+
+    _TEST = {
+        'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+        'info_dict': {
+            'id': 'LularoeHusbandMike',
+            'title': 'LULAROE HUSBAND MIKE',
+            'description': 'md5:6cf4ec8047768098da58e446e82c82f0',
+        },
+        # Periscope only shows videos from the last 24 hours, so the playlist
+        # may contain 0 videos
+        'playlist_mincount': 0,
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_id)
+
+        data_store = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-store=(["\'])(?P<data>.+?)\1',
+                webpage, 'data store', default='{}', group='data')),
+            user_id)
+
+        user = data_store.get('User', {}).get('user', {})
+        title = user.get('display_name') or user.get('username')
+        description = user.get('description')
+
+        broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
+                         data_store.get('BroadcastCache', {}).get('broadcastIds', []))
+
+        entries = [
+            self.url_result(
+                'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id))
+            for broadcast_id in broadcast_ids]
+
+        return self.playlist_result(entries, user_id, title, description)
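
The playlist assembly boils down to one fallback: prefer the user's broadcast
history from the embedded data store, fall back to the broadcast cache. In
isolation (sample inputs assumed):

    def collect_broadcast_ids(data_store):
        # Either source may be absent; the history takes precedence when present
        return (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
                data_store.get('BroadcastCache', {}).get('broadcastIds', []))

    assert collect_broadcast_ids({'BroadcastCache': {'broadcastIds': ['1abc']}}) == ['1abc']
    assert collect_broadcast_ids({}) == []
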
index bc559d1df289fca39b96f5cfc5519bf6acb8bbb3..77e1211d6095cf17464ce09a27b756157b4931e9 100644 (file)
@@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage)
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
index 6d138ef25d2d5cec02a012f5a06af085a6c35d26..0bc7431189a0eed819fb85a6fbbdc1558a4b84ed 100644 (file)
@@ -4,9 +4,8 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    xpath_text,
+    dict_get,
     float_or_none,
-    int_or_none,
 )
 
 
@@ -23,6 +22,19 @@ class PlaywireIE(InfoExtractor):
             'duration': 145.94,
         },
     }, {
+        # m3u8 in f4m
+        'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
+        'info_dict': {
+            'id': '4840492',
+            'ext': 'mp4',
+            'title': 'ITV EL SHOW FULL',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # Multiple resolutions while bitrates missing
         'url': 'http://cdn.playwire.com/11625/embed/85228.html',
         'only_matching': True,
     }, {
@@ -48,25 +60,10 @@ class PlaywireIE(InfoExtractor):
         thumbnail = content.get('poster')
         src = content['media']['f4m']
 
-        f4m = self._download_xml(src, video_id)
-        base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True)
-        formats = []
-        for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'):
-            media_url = media.get('url')
-            if not media_url:
-                continue
-            tbr = int_or_none(media.get('bitrate'))
-            width = int_or_none(media.get('width'))
-            height = int_or_none(media.get('height'))
-            f = {
-                'url': '%s/%s' % (base_url, media.attrib['url']),
-                'tbr': tbr,
-                'width': width,
-                'height': height,
-            }
-            if not (tbr or width or height):
-                f['quality'] = 1 if '-hd.' in media_url else 0
-            formats.append(f)
+        formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls')
+        for a_format in formats:
+            if not dict_get(a_format, ['tbr', 'width', 'height']):
+                a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
         self._sort_formats(formats)
 
         return {
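
The rewritten Playwire extractor delegates manifest parsing to
_extract_f4m_formats and keeps only the quality fallback. That fallback in
isolation: formats reporting neither bitrate nor dimensions are ranked by the
'-hd.' marker in their URL (dict_get is approximated with any(); sample URLs are
illustrative):

    def assign_quality(formats):
        for f in formats:
            if not any(f.get(k) for k in ('tbr', 'width', 'height')):
                f['quality'] = 1 if '-hd.' in f['url'] else 0
        return formats

    fmts = assign_quality([{'url': 'http://cdn.example.com/x-hd.mp4'},
                           {'url': 'http://cdn.example.com/x-sd.mp4'}])
    assert [f['quality'] for f in fmts] == [1, 0]
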
diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py
new file mode 100644 (file)
index 0000000..f559b89
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class PolskieRadioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
+        'info_dict': {
+            'id': '1587943',
+            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+        },
+        'playlist': [{
+            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+            'info_dict': {
+                'id': '1540576',
+                'ext': 'mp3',
+                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+                'timestamp': 1456594200,
+                'upload_date': '20160227',
+                'duration': 2364,
+                'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+            },
+        }],
+    }, {
+        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+        'info_dict': {
+            'id': '1635803',
+            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
+            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
+        'only_matching': True,
+    }, {
+        # with mp4 video
+        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        content = self._search_regex(
+            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
+            webpage, 'content')
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
+            webpage, 'timestamp', fatal=False))
+
+        thumbnail_url = self._og_search_thumbnail(webpage)
+
+        entries = []
+
+        media_urls = set()
+
+        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
+            media = self._parse_json(data_media, playlist_id, fatal=False)
+            if not media or not media.get('file') or not media.get('desc'):
+                continue
+            media_url = self._proto_relative_url(media['file'], 'http:')
+            if media_url in media_urls:
+                continue
+            media_urls.add(media_url)
+            entries.append({
+                'id': compat_str(media['id']),
+                'url': media_url,
+                'title': compat_urllib_parse_unquote(media['desc']),
+                'duration': int_or_none(media.get('length')),
+                'vcodec': 'none' if media.get('provider') == 'audio' else None,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url
+            })
+
+        title = self._og_search_title(webpage).strip()
+        description = strip_or_none(self._og_search_description(webpage))
+
+        return self.playlist_result(entries, playlist_id, title, description)
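
The article scan above parses each data-media JSON blob and de-duplicates by
resolved media URL. A condensed sketch (json.loads stands in for _parse_json;
the sample markup is illustrative):

    import json
    import re

    def collect_media(content):
        seen, entries = set(), []
        for blob in re.findall(r'<[^>]+data-media=({[^>]+})', content):
            media = json.loads(blob)
            if not media.get('file') or not media.get('desc'):
                continue
            # Protocol-relative URLs get an explicit scheme, then de-duplicate
            url = 'http:' + media['file'] if media['file'].startswith('//') else media['file']
            if url in seen:
                continue
            seen.add(url)
            entries.append({'id': str(media['id']), 'url': url})
        return entries

    sample = '<span data-media={"id": 1, "file": "//static.prsa.pl/a.mp3", "desc": "x"}></span>'
    assert collect_media(sample + sample) == [{'id': '1', 'url': 'http://static.prsa.pl/a.mp3'}]
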
index 39b53ecf68c77786f18956040bf7ccac4fd6dbc5..8df12eec0d44c371d99b536b55694cfd2211f9d0 100644 (file)
@@ -1,19 +1,32 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     int_or_none,
     js_to_json,
-    qualities,
 )
 
 
 class PornHdIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
-    _TEST = {
+    _TESTS = [{
+        'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+        'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5',
+        'info_dict': {
+            'id': '9864',
+            'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+            'ext': 'mp4',
+            'title': 'Restroom selfie masturbation',
+            'description': 'md5:3748420395e03e31ac96857a8f125b2b',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        # removed video
         'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
         'md5': '956b8ca569f7f4d8ec563e2c41598441',
         'info_dict': {
@@ -25,8 +38,9 @@ class PornHdIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg',
             'view_count': int,
             'age_limit': 18,
-        }
-    }
+        },
+        'skip': 'Not available anymore',
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -38,28 +52,38 @@ class PornHdIE(InfoExtractor):
         title = self._html_search_regex(
             [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
              r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
-        description = self._html_search_regex(
-            r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
-        view_count = int_or_none(self._html_search_regex(
-            r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
-        thumbnail = self._search_regex(
-            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
 
-        quality = qualities(['sd', 'hd'])
-        sources = json.loads(js_to_json(self._search_regex(
+        sources = self._parse_json(js_to_json(self._search_regex(
             r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
-            webpage, 'sources')))
+            webpage, 'sources', default='{}')), video_id)
+
+        if not sources:
+            message = self._html_search_regex(
+                r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
+                webpage, 'error message', group='value')
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
         formats = []
-        for qname, video_url in sources.items():
+        for format_id, video_url in sources.items():
             if not video_url:
                 continue
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]', format_id, 'height', default=None))
             formats.append({
                 'url': video_url,
-                'format_id': qname,
-                'quality': quality(qname),
+                'format_id': format_id,
+                'height': height,
             })
         self._sort_formats(formats)
 
+        description = self._html_search_regex(
+            r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1',
+            webpage, 'description', fatal=False, group='value')
+        view_count = int_or_none(self._html_search_regex(
+            r'(\d+) views\s*<', webpage, 'view count', fatal=False))
+        thumbnail = self._search_regex(
+            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
+
         return {
             'id': video_id,
             'display_id': display_id,
index 407ea08d4350b52666150e2784652535625c5e31..d2c92531b0745ed189da699f4392725f2d4cddd6 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import itertools
@@ -24,7 +25,15 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    IE_DESC = 'PornHub and Thumbzilla'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+                            (?:www\.)?thumbzilla\.com/video/
+                        )
+                        (?P<id>[0-9a-z]+)
+                    '''
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '1e19b41231a02eba417839222ac9d58e',
@@ -39,13 +48,47 @@ class PornHubIE(InfoExtractor):
             'dislike_count': int,
             'comment_count': int,
             'age_limit': 18,
-        }
+        },
+    }, {
+        # non-ASCII title
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
+        'info_dict': {
+            'id': '1331683002',
+            'ext': 'mp4',
+            'title': '重庆婷婷女王足交',
+            'uploader': 'cj397186295',
+            'duration': 1753,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
         'only_matching': True,
     }, {
+        # removed at the request of cam4.com
         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
         'only_matching': True,
+    }, {
+        # removed at the request of the copyright owner
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
+        'only_matching': True,
+    }, {
+        # removed by uploader
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
+        'only_matching': True,
+    }, {
+        # private video
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -68,27 +111,33 @@ class PornHubIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         error_msg = self._html_search_regex(
-            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
-            webpage, 'error message', default=None)
+            r'(?s)<div[^>]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P<error>.+?)</div>',
+            webpage, 'error message', default=None, group='error')
         if error_msg:
             error_msg = re.sub(r'\s+', ' ', error_msg)
             raise ExtractorError(
                 'PornHub said: %s' % error_msg,
                 expected=True, video_id=video_id)
 
+        # video_title from flashvars contains whitespace instead of non-ASCII
+        # characters (see http://www.pornhub.com/view_video.php?viewkey=1331683002),
+        # so we no longer rely on it.
+        title = self._html_search_meta(
+            'twitter:title', webpage, default=None) or self._search_regex(
+            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
+             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
+             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
+            webpage, 'title', group='title')
+
         flashvars = self._parse_json(
             self._search_regex(
                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
             video_id)
         if flashvars:
-            video_title = flashvars.get('video_title')
             thumbnail = flashvars.get('image_url')
             duration = int_or_none(flashvars.get('video_duration'))
         else:
-            video_title, thumbnail, duration = [None] * 3
-
-        if not video_title:
-            video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
+            thumbnail, duration = [None] * 2
 
         video_uploader = self._html_search_regex(
             r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
@@ -137,7 +186,7 @@ class PornHubIE(InfoExtractor):
         return {
             'id': video_id,
             'uploader': video_uploader,
-            'title': video_title,
+            'title': title,
             'thumbnail': thumbnail,
             'duration': duration,
             'view_count': view_count,
index 07d49d489d6779b0f6bb7bd12bc610497c576c2e..c6eee3b72a6e428012644ee7c10caad9be78fa86 100644 (file)
@@ -5,7 +5,7 @@ import re
 
 from hashlib import sha1
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
@@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
@@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
@@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
@@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
@@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor):
     ]
 
     def _extract_clip(self, url, webpage):
-        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
+        clip_id = self._html_search_regex(
+            self._CLIPID_REGEXES, webpage, 'clip id')
 
         access_token = 'prosieben'
         client_name = 'kolibri-2.0.19-splec4'
         client_location = url
 
-        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_location': client_location,
-            'client_name': client_name,
-            'ids': clip_id,
-        })
-
-        video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
+        video = self._download_json(
+            'http://vas.sim-technik.de/vas/live/v2/videos',
+            clip_id, 'Downloading videos JSON', query={
+                'access_token': access_token,
+                'client_location': client_location,
+                'client_name': client_name,
+                'ids': clip_id,
+            })[0]
 
         if video.get('is_protected') is True:
             raise ExtractorError('This video is DRM protected.', expected=True)
 
         duration = float_or_none(video.get('duration'))
-        source_ids = [source['id'] for source in video['sources']]
-        source_ids_str = ','.join(map(str, source_ids))
+        source_ids = [compat_str(source['id']) for source in video['sources']]
 
         g = '01!8d8F_)r9]4s[qeuXfP%'
+        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest()
 
-        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
-                                 .encode('utf-8')).hexdigest()
-
-        sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_id': client_id,
-            'client_location': client_location,
-            'client_name': client_name,
-        }))
-
-        sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+        sources = self._download_json(
+            'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
+            clip_id, 'Downloading sources JSON', query={
+                'access_token': access_token,
+                'client_id': client_id,
+                'client_location': client_location,
+                'client_name': client_name,
+            })
         server_id = sources['server_id']
 
-        client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
-                                          client_location, source_ids_str, g, client_name])
-                                 .encode('utf-8')).hexdigest()
-
-        url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_id': client_id,
-            'client_location': client_location,
-            'client_name': client_name,
-            'server_id': server_id,
-            'source_ids': source_ids_str,
-        }))
-
-        urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
-
         title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
-        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        upload_date = unified_strdate(self._html_search_regex(
-            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
-
-        formats = []
-
-        urls_sources = urls['sources']
-        if isinstance(urls_sources, dict):
-            urls_sources = urls_sources.values()
 
         def fix_bitrate(bitrate):
             bitrate = int_or_none(bitrate)
@@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor):
                 return None
             return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
 
-        for source in urls_sources:
-            protocol = source['protocol']
-            source_url = source['url']
-            if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
-                if not mobj:
-                    continue
-                path = mobj.group('path')
-                mp4colon_index = path.rfind('mp4:')
-                app = path[:mp4colon_index]
-                play_path = path[mp4colon_index:]
-                formats.append({
-                    'url': '%s/%s' % (mobj.group('url'), app),
-                    'app': app,
-                    'play_path': play_path,
-                    'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
-                    'page_url': 'http://www.prosieben.de',
-                    'vbr': fix_bitrate(source['bitrate']),
-                    'ext': 'mp4',
-                    'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
-                })
-            elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
-                formats.extend(self._extract_f4m_formats(source_url, clip_id))
-            else:
-                formats.append({
-                    'url': source_url,
-                    'vbr': fix_bitrate(source['bitrate']),
+        formats = []
+        for source_id in source_ids:
+            client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest()
+            urls = self._download_json(
+                'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
+                clip_id, 'Downloading urls JSON', fatal=False, query={
+                    'access_token': access_token,
+                    'client_id': client_id,
+                    'client_location': client_location,
+                    'client_name': client_name,
+                    'server_id': server_id,
+                    'source_ids': source_id,
                 })
-
+            if not urls:
+                continue
+            if urls.get('status_code') != 0:
+                raise ExtractorError('This video is unavailable', expected=True)
+            urls_sources = urls['sources']
+            if isinstance(urls_sources, dict):
+                urls_sources = urls_sources.values()
+            for source in urls_sources:
+                source_url = source.get('url')
+                if not source_url:
+                    continue
+                protocol = source.get('protocol')
+                mimetype = source.get('mimetype')
+                if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        source_url, clip_id, f4m_id='hds', fatal=False))
+                elif mimetype == 'application/x-mpegURL':
+                    formats.extend(self._extract_m3u8_formats(
+                        source_url, clip_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    tbr = fix_bitrate(source['bitrate'])
+                    if protocol in ('rtmp', 'rtmpe'):
+                        mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
+                        if not mobj:
+                            continue
+                        path = mobj.group('path')
+                        mp4colon_index = path.rfind('mp4:')
+                        app = path[:mp4colon_index]
+                        play_path = path[mp4colon_index:]
+                        formats.append({
+                            'url': '%s/%s' % (mobj.group('url'), app),
+                            'app': app,
+                            'play_path': play_path,
+                            'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+                            'page_url': 'http://www.prosieben.de',
+                            'tbr': tbr,
+                            'ext': 'flv',
+                            'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''),
+                        })
+                    else:
+                        formats.append({
+                            'url': source_url,
+                            'tbr': tbr,
+                            'format_id': 'http%s' % ('-%d' % tbr if tbr else ''),
+                        })
         self._sort_formats(formats)
 
+        description = self._html_search_regex(
+            self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._html_search_regex(
+            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
+
         return {
             'id': clip_id,
             'title': title,
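
Reviewer's note: the per-source client_id above is derived by SHA-1 hashing a fixed concatenation of fields (note that g appears twice). A minimal standalone sketch of that derivation, assuming the same join order as the hunk above; all sample values are invented:

    from hashlib import sha1

    def make_client_id(g, clip_id, access_token, server_id,
                       client_location, source_id, client_name):
        # g[:2] is prepended to the hex digest of the joined fields
        payload = ''.join([g, clip_id, access_token, server_id,
                           client_location, source_id, g, client_name])
        return g[:2] + sha1(payload.encode('utf-8')).hexdigest()

    print(make_client_id('gsecret', '123', 'tok', 'srv', 'loc', 'src1', 'client'))
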
index 976c8feec657f8de731d3ffadfa09189ed1628cf..069dbfaed0638e396d024ec81d5142d18f9ad90f 100644 (file)
@@ -2,22 +2,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    unescapeHTML,
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class R7IE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
+                        https?://
                         (?:
                             (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
                             noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
                             player\.r7\.com/video/i/
                         )
                         (?P<id>[\da-f]{24})
-                        '''
+                    '''
     _TESTS = [{
         'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
         'md5': '403c4e393617e8e8ddc748978ee8efde',
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
             'id': '54e7050b0cf2ff57e0279389',
             'ext': 'mp4',
             'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 98,
             'like_count': int,
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://player.r7.com/video/i/%s' % video_id, video_id)
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
 
-        item = self._parse_json(js_to_json(self._search_regex(
-            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
-        title = unescapeHTML(item['title'])
-        thumbnail = item.get('init', {}).get('thumbUri')
-        duration = None
-
-        statistics = item.get('statistics', {})
-        like_count = int_or_none(statistics.get('likes'))
-        view_count = int_or_none(statistics.get('views'))
+        title = video['title']
 
         formats = []
-        for format_key, format_dict in item['playlist'][0].items():
-            src = format_dict.get('src')
-            if not src:
-                continue
-            format_id = format_dict.get('format') or format_key
-            if duration is None:
-                duration = format_dict.get('duration')
-            if '.f4m' in src:
-                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
-            elif src.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
-            else:
-                formats.append({
-                    'url': src,
-                    'format_id': format_id,
-                })
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # m3u8 format always matches the http format, so copy metadata
+            # from one to the other
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                formats))
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
         self._sort_formats(formats)
 
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
         return {
             'id': video_id,
             'title': title,
+            'description': description,
             'thumbnail': thumbnail,
             'duration': duration,
             'like_count': like_count,
             'view_count': view_count,
             'formats': formats,
         }
+
+
+class R7ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())
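
Reviewer's note on the metadata-sharing logic above: when the m3u8 master yields exactly one real video variant, the progressive MP4 is assumed to be the same rendition and inherits its width/height/tbr. A standalone sketch of that merge (the helper name is mine, not the extractor's):

    def merge_http_with_single_hls(formats, media_url):
        f = {'url': media_url, 'format_id': 'http'}
        video_variants = [x for x in formats
                          if x.get('vcodec') != 'none'
                          and x.get('resolution') != 'multiple']
        if len(video_variants) == 1:
            merged = video_variants[0].copy()
            merged.update(f)  # http url/format_id win, hls metadata is kept
            merged['protocol'] = 'http'
            f = merged
        return formats + [f]

    hls = [{'format_id': 'hls-628', 'vcodec': 'avc1', 'height': 360, 'tbr': 628}]
    print(merge_http_with_single_hls(hls, 'http://example.com/v.mp4'))
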
diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py
new file mode 100644 (file)
index 0000000..8ec4026
--- /dev/null
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    find_xpath_attr,
+    determine_ext,
+    int_or_none,
+    unified_strdate,
+    xpath_element,
+    ExtractorError,
+    determine_protocol,
+)
+
+
+class RadioCanadaIE(InfoExtractor):
+    IE_NAME = 'radiocanada'
+    _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+        'info_dict': {
+            'id': '7184272',
+            'ext': 'mp4',
+            'title': 'Le parcours du tireur capté sur vidéo',
+            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+            'upload_date': '20141023',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        app_code, video_id = re.match(self._VALID_URL, url).groups()
+
+        device_types = ['ipad', 'android']
+        if app_code != 'toutv':
+            device_types.append('flash')
+
+        formats = []
+        # TODO: extract f4m formats
+        # f4m formats can be extracted using the flashhd device_type but they produce unplayable files
+        for device_type in device_types:
+            v_data = self._download_xml(
+                'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
+                video_id, note='Downloading %s XML' % device_type, query={
+                    'appCode': app_code,
+                    'idMedia': video_id,
+                    'connectionType': 'broadband',
+                    'multibitrate': 'true',
+                    'deviceType': device_type,
+                    # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
+                    'paysJ391wsHjbOJwvCs26toz': 'CA',
+                    'bypasslock': 'NZt5K62gRqfc',
+                }, fatal=False)
+            v_url = xpath_text(v_data, 'url')
+            if not v_url:
+                continue
+            if v_url == 'null':
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
+            ext = determine_ext(v_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    v_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                bitrates = xpath_element(v_data, 'bitrates')
+                for url_e in bitrates.findall('url'):
+                    tbr = int_or_none(url_e.get('bitrate'))
+                    if not tbr:
+                        continue
+                    f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
+                    protocol = determine_protocol({'url': f_url})
+                    formats.append({
+                        'format_id': '%s-%d' % (protocol, tbr),
+                        'url': f_url,
+                        'ext': 'flv' if protocol == 'rtmp' else ext,
+                        'protocol': protocol,
+                        'width': int_or_none(url_e.get('width')),
+                        'height': int_or_none(url_e.get('height')),
+                        'tbr': tbr,
+                    })
+                    if protocol == 'rtsp':
+                        base_url = self._search_regex(
+                            r'rtsp://([^?]+)', f_url, 'base url', default=None)
+                        if base_url:
+                            base_url = 'http://' + base_url
+                            formats.extend(self._extract_m3u8_formats(
+                                base_url + '/playlist.m3u8', video_id, 'mp4',
+                                'm3u8_native', m3u8_id='hls', fatal=False))
+                            formats.extend(self._extract_f4m_formats(
+                                base_url + '/manifest.f4m', video_id,
+                                f4m_id='hds', fatal=False))
+        self._sort_formats(formats)
+
+        metadata = self._download_xml(
+            'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
+            video_id, note='Downloading metadata XML', query={
+                'appCode': app_code,
+                'idMedia': video_id,
+            })
+
+        def get_meta(name):
+            el = find_xpath_attr(metadata, './/Meta', 'name', name)
+            return el.text if el is not None else None
+
+        return {
+            'id': video_id,
+            'title': get_meta('Title'),
+            'description': get_meta('Description') or get_meta('ShortDescription'),
+            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+            'duration': int_or_none(get_meta('length')),
+            'series': get_meta('Emission'),
+            'season_number': int_or_none(get_meta('SrcSaison')),
+            'episode_number': int_or_none(get_meta('SrcEpisode')),
+            'upload_date': unified_strdate(get_meta('Date')),
+            'formats': formats,
+        }
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+    IE_NAME = 'radiocanada:audiovideo'
+    _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+        'info_dict': {
+            'id': '7527184',
+            'ext': 'mp4',
+            'title': 'Barack Obama au Vietnam',
+            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+            'upload_date': '20160523',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
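
Reviewer's note: the multibitrate handling above rewrites the single validated URL once per bitrate listed in the XML, by swapping the number embedded in the filename. A worked example with a hypothetical URL:

    import re

    v_url = 'http://example.com/media/clip_1200.mp4'  # hypothetical validated URL
    ext = 'mp4'
    for tbr in (300, 800, 1200):
        # replace '<digits>.mp4' with '<tbr>.mp4', as in _real_extract above
        print(re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url))
    # -> clip_300.mp4, clip_800.mp4, clip_1200.mp4
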
index 884c284206cb73303a6631695872ce79ecf7795e..ec4fa6e602ea779dd6d3a530ea6cfb639eee3cf4 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import(
+from ..utils import (
     unified_strdate,
     str_to_int,
 )
index e36ce1aa1940deafd5a633bec814e7462008c3b1..dc640b1bcb58ddb79c89e5f2346a5bc5c63a3547 100644 (file)
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urlparse,
-)
+from ..compat import compat_urlparse
 from ..utils import (
-    ExtractorError,
     determine_ext,
+    ExtractorError,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    int_or_none,
     parse_duration,
     unified_strdate,
-    int_or_none,
+    update_url_query,
     xpath_text,
 )
 
 
-class RaiTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
+class RaiBaseIE(InfoExtractor):
+    def _extract_relinker_formats(self, relinker_url, video_id):
+        formats = []
+
+        for platform in ('mon', 'flash', 'native'):
+            relinker = self._download_xml(
+                relinker_url, video_id,
+                note='Downloading XML metadata for platform %s' % platform,
+                transform_source=fix_xml_ampersands,
+                query={'output': 45, 'pl': platform},
+                headers=self.geo_verification_headers())
+
+            media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
+            if media_url == 'http://download.rai.it/video_no_available.mp4':
+                self.raise_geo_restricted()
+
+            ext = determine_ext(media_url)
+            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
+                continue
+
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                manifest_url = update_url_query(
+                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
+                formats.append({
+                    'url': media_url,
+                    'tbr': bitrate if bitrate and bitrate > 0 else None,
+                    'format_id': 'http-%d' % bitrate if bitrate and bitrate > 0 else 'http',
+                })
+
+        return formats
+
+    def _extract_from_content_id(self, content_id, base_url):
+        media = self._download_json(
+            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
+            content_id, 'Downloading video JSON')
+
+        thumbnails = []
+        for image_type in ('image', 'image_medium', 'image_300'):
+            thumbnail_url = media.get(image_type)
+            if thumbnail_url:
+                thumbnails.append({
+                    'url': compat_urlparse.urljoin(base_url, thumbnail_url),
+                })
+
+        formats = []
+        media_type = media['type']
+        if 'Audio' in media_type:
+            formats.append({
+                'format_id': media.get('formatoAudio'),
+                'url': media['audioUrl'],
+                'ext': media.get('formatoAudio'),
+            })
+        elif 'Video' in media_type:
+            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
+            self._sort_formats(formats)
+        else:
+            raise ExtractorError('not a media file')
+
+        subtitles = {}
+        captions = media.get('subtitlesUrl')
+        if captions:
+            STL_EXT = '.stl'
+            SRT_EXT = '.srt'
+            if captions.endswith(STL_EXT):
+                captions = captions[:-len(STL_EXT)] + SRT_EXT
+            subtitles['it'] = [{
+                'ext': 'srt',
+                'url': captions,
+            }]
+
+        return {
+            'id': content_id,
+            'title': media['name'],
+            'description': media.get('desc'),
+            'thumbnails': thumbnails,
+            'uploader': media.get('author'),
+            'upload_date': unified_strdate(media.get('date')),
+            'duration': parse_duration(media.get('length')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class RaiTVIE(RaiBaseIE):
+    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
     _TESTS = [
         {
             'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
-            'md5': '96382709b61dd64a6b88e0f791e6df4c',
+            'md5': '8970abf8caf8aef4696e7b1f2adfc696',
             'info_dict': {
                 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Report del 07/04/2014',
                 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
                 'upload_date': '20140407',
                 'duration': 6160,
+                'thumbnail': 're:^https?://.*\.jpg$',
             }
         },
         {
+            # no m3u8 stream
             'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
-            'md5': 'd9751b78eac9710d62c2447b224dea39',
+            # HDS download, MD5 is unstable
             'info_dict': {
                 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
                 'ext': 'flv',
                 'title': 'TG PRIMO TEMPO',
                 'upload_date': '20140612',
                 'duration': 1758,
+                'thumbnail': 're:^https?://.*\.jpg$',
             },
+            'skip': 'Geo-restricted to Italy',
         },
         {
             'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
@@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor):
         },
         {
             'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
-            'md5': '496ab63e420574447f70d02578333437',
+            'md5': 'e57493e1cb8bc7c564663f363b171847',
             'info_dict': {
                 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Il Candidato - Primo episodio: "Le Primarie"',
                 'description': 'md5:364b604f7db50594678f483353164fb8',
                 'upload_date': '20140923',
                 'duration': 386,
+                'thumbnail': 're:^https?://.*\.jpg$',
             }
         },
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        media = self._download_json(
-            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id,
-            video_id, 'Downloading video JSON')
-
-        thumbnails = []
-        for image_type in ('image', 'image_medium', 'image_300'):
-            thumbnail_url = media.get(image_type)
-            if thumbnail_url:
-                thumbnails.append({
-                    'url': thumbnail_url,
-                })
-
-        subtitles = []
-        formats = []
-        media_type = media['type']
-        if 'Audio' in media_type:
-            formats.append({
-                'format_id': media.get('formatoAudio'),
-                'url': media['audioUrl'],
-                'ext': media.get('formatoAudio'),
-            })
-        elif 'Video' in media_type:
-            def fix_xml(xml):
-                return xml.replace(' tag elementi', '').replace('>/', '</')
-
-            relinker = self._download_xml(
-                media['mediaUri'] + '&output=43',
-                video_id, transform_source=fix_xml)
-
-            has_subtitle = False
-
-            for element in relinker.findall('element'):
-                media_url = xpath_text(element, 'url')
-                ext = determine_ext(media_url)
-                content_type = xpath_text(element, 'content-type')
-                if ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        media_url, video_id, 'mp4', 'm3u8_native',
-                        m3u8_id='hls', fatal=False))
-                elif ext == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
-                        media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
-                        video_id, f4m_id='hds', fatal=False))
-                elif ext == 'stl':
-                    has_subtitle = True
-                elif content_type.startswith('video/'):
-                    bitrate = int_or_none(xpath_text(element, 'bitrate'))
-                    formats.append({
-                        'url': media_url,
-                        'tbr': bitrate if bitrate > 0 else None,
-                        'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
-                    })
-                elif content_type.startswith('image/'):
-                    thumbnails.append({
-                        'url': media_url,
-                    })
-
-            self._sort_formats(formats)
 
-            if has_subtitle:
-                webpage = self._download_webpage(url, video_id)
-                subtitles = self._get_subtitles(video_id, webpage)
-        else:
-            raise ExtractorError('not a media file')
+        return self._extract_from_content_id(video_id, url)
 
-        return {
-            'id': video_id,
-            'title': media['name'],
-            'description': media.get('desc'),
-            'thumbnails': thumbnails,
-            'uploader': media.get('author'),
-            'upload_date': unified_strdate(media.get('date')),
-            'duration': parse_duration(media.get('length')),
-            'formats': formats,
-            'subtitles': subtitles,
-        }
 
-    def _get_subtitles(self, video_id, webpage):
-        subtitles = {}
-        m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
-        if m:
-            captions = m.group('captions')
-            STL_EXT = '.stl'
-            SRT_EXT = '.srt'
-            if captions.endswith(STL_EXT):
-                captions = captions[:-len(STL_EXT)] + SRT_EXT
-            subtitles['it'] = [{
-                'ext': 'srt',
-                'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
-            }]
-        return subtitles
-
-
-class RaiIE(InfoExtractor):
+class RaiIE(RaiBaseIE):
     _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
     _TESTS = [
         {
             'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
-            'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7',
+            'md5': '2dd727e61114e1ee9c47f0da6914e178',
             'info_dict': {
                 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Il pacco',
                 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
                 'upload_date': '20141221',
             },
-        }
+        },
+        {
+            # Direct relinker URL
+            'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+            # HDS live stream, MD5 is unstable
+            'info_dict': {
+                'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+                'ext': 'flv',
+                'title': 'EuroNews',
+            },
+            'skip': 'Geo-restricted to Italy',
+        },
+        {
+            # Embedded content item ID
+            'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+            'md5': '84c1135ce960e8822ae63cec34441d63',
+            'info_dict': {
+                'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
+                'ext': 'mp4',
+                'title': 'TG1 ore 20:00 del 02/07/2016',
+                'upload_date': '20160702',
+            },
+        },
+        {
+            'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+            # HDS live stream, MD5 is unstable
+            'info_dict': {
+                'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+                'ext': 'flv',
+                'title': 'La diretta di Rainews24',
+            },
+        },
     ]
 
     @classmethod
@@ -201,7 +238,30 @@ class RaiIE(InfoExtractor):
         iframe_url = self._search_regex(
             [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
              r'drawMediaRaiTV\(["\'](.+?)["\']'],
-            webpage, 'iframe')
-        if not iframe_url.startswith('http'):
-            iframe_url = compat_urlparse.urljoin(url, iframe_url)
-        return self.url_result(iframe_url)
+            webpage, 'iframe', default=None)
+        if iframe_url:
+            if not iframe_url.startswith('http'):
+                iframe_url = compat_urlparse.urljoin(url, iframe_url)
+            return self.url_result(iframe_url)
+
+        content_item_id = self._search_regex(
+            r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
+            webpage, 'content item ID', group='content_id', default=None)
+        if content_item_id:
+            return self._extract_from_content_id(content_item_id, url)
+
+        relinker_url = compat_urlparse.urljoin(url, self._search_regex(
+            r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
+            webpage, 'relinker URL', group='url'))
+        formats = self._extract_relinker_formats(relinker_url, video_id)
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
+            webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
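
Reviewer's note: the platform loop in _extract_relinker_formats deliberately pairs each manifest type with one platform so the same stream is not collected twice. A condensed restatement of that gate:

    def should_skip(ext, platform):
        # HLS is only taken from 'mon', HDS only from 'flash'; everything
        # else (plain http urls) is accepted from any platform
        return (ext == 'm3u8' and platform != 'mon') or \
               (ext == 'f4m' and platform != 'flash')

    assert should_skip('m3u8', 'flash')
    assert not should_skip('m3u8', 'mon')
    assert not should_skip('mp4', 'native')
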
index 796adfdf9dab7f07481328026bb21591a7aa5612..bf200ea4d3f8b17f171bcce01c930b5d183fcc2e 100644 (file)
@@ -1,23 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     parse_duration,
     parse_iso8601,
+    js_to_json,
 )
+from ..compat import compat_str
 
 
 class RDSIE(InfoExtractor):
     IE_DESC = 'RDS.ca'
-    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
 
     _TESTS = [{
         'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
         'info_dict': {
-            'id': '3.1132799',
+            'id': '604333',
             'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
             'ext': 'mp4',
             'title': 'Fowler Jr. prend la direction de Jacksonville',
@@ -33,22 +33,17 @@ class RDSIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
-        # TODO: extract f4m from 9c9media.com
-        video_url = self._search_regex(
-            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
-            webpage, 'video url')
-
-        title = self._og_search_title(webpage) or self._html_search_meta(
+        item = self._parse_json(
+            self._search_regex(
+                r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'),
+            display_id, js_to_json)
+        video_id = compat_str(item['id'])
+        title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(
             'title', webpage, 'title', fatal=True)
         description = self._og_search_description(webpage) or self._html_search_meta(
             'description', webpage, 'description')
-        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+        thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
             [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
              r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
             webpage, 'thumbnail', fatal=False)
@@ -61,13 +56,15 @@ class RDSIE(InfoExtractor):
         age_limit = self._family_friendly_search(webpage)
 
         return {
+            '_type': 'url_transparent',
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
+            'url': '9c9media:rds_web:%s' % video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
             'timestamp': timestamp,
             'duration': duration,
             'age_limit': age_limit,
+            'ie_key': 'NineCNineMedia',
         }
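
Reviewer's note: '_type': 'url_transparent' means the RDS page only contributes metadata while the NineCNineMedia extractor resolves the actual formats, with fields set here taking precedence. A rough sketch of that merge semantics (illustrative only, not the YoutubeDL implementation):

    def merge_url_transparent(outer, delegated):
        merged = dict(delegated)
        # non-None fields from the outer info dict take precedence
        merged.update({k: v for k, v in outer.items()
                       if v is not None and k not in ('_type', 'url', 'ie_key')})
        return merged

    delegated = {'id': '604333', 'title': 'api title', 'formats': ['...']}
    outer = {'_type': 'url_transparent', 'title': 'page title', 'description': None}
    print(merge_url_transparent(outer, delegated))  # keeps formats, page title wins
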
index 7ba41ba593295cdc7d2e28e6b64702321ed1ef08..721fc3a9e2d2b3431051ea00982f72ae1d98ff65 100644 (file)
@@ -1,7 +1,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    unified_strdate,
+)
 
 
 class RedTubeIE(InfoExtractor):
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
             'id': '66418',
             'ext': 'mp4',
             'title': 'Sucked on a toilet',
+            'upload_date': '20120831',
+            'duration': 596,
+            'view_count': int,
             'age_limit': 18,
         }
     }
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
         if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
             raise ExtractorError('Video %s has been removed' % video_id, expected=True)
 
-        video_url = self._html_search_regex(
-            r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
-        video_title = self._html_search_regex(
-            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
-            webpage, 'title')
-        video_thumbnail = self._og_search_thumbnail(webpage)
+        title = self._html_search_regex(
+            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+             r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='title')
+
+        formats = []
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+            video_id, fatal=False)
+        if sources and isinstance(sources, dict):
+            for format_id, format_url in sources.items():
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'height': int_or_none(format_id),
+                    })
+        else:
+            video_url = self._html_search_regex(
+                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+            formats.append({'url': video_url})
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+            webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+            webpage, 'view count', fatal=False))
 
         # No self-labeling, but they describe themselves as
         # "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'url': video_url,
             'ext': 'mp4',
-            'title': video_title,
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
             'age_limit': age_limit,
+            'formats': formats,
         }
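
Reviewer's note: the new sources object maps quality labels to URLs, with the label doubling as the height when numeric. A minimal sketch of that mapping, using int_or_none-style coercion in place of the youtube-dl helper:

    def parse_sources(sources):
        formats = []
        for format_id, format_url in sources.items():
            if not format_url:
                continue
            try:
                height = int(format_id)  # '480' -> 480
            except ValueError:
                height = None            # non-numeric labels carry no height
            formats.append({'url': format_url, 'format_id': format_id,
                            'height': height})
        return formats

    print(parse_sources({'480': 'http://example.com/480.mp4', '720': ''}))
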
diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py
new file mode 100644 (file)
index 0000000..961d504
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class ReutersIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562',
+        'md5': '8015113643a0b12838f160b0b81cc2ee',
+        'info_dict': {
+            'id': '368575562',
+            'ext': 'mp4',
+            'title': 'San Francisco police chief resigns',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
+        video_data = js_to_json(self._search_regex(
+            r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
+            webpage, 'video data'))
+
+        def get_json_value(key, fatal=False):
+            return self._search_regex('"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)
+
+        title = unescapeHTML(get_json_value('title', fatal=True))
+        mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()
+
+        mas_data = self._download_json(
+            'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
+            video_id, transform_source=js_to_json)
+        formats = []
+        for f in mas_data:
+            f_url = f.get('url')
+            if not f_url:
+                continue
+            method = f.get('method')
+            if method == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                container = f.get('container')
+                ext = '3gp' if method == 'mobile' else container
+                formats.append({
+                    'format_id': ext,
+                    'url': f_url,
+                    'ext': ext,
+                    'container': container if method != 'mobile' else None,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': get_json_value('thumb'),
+            'duration': int_or_none(get_json_value('seconds')),
+            'formats': formats,
+        }
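
Reviewer's note: get_json_value above pulls individual string fields straight out of the player's JS object with a regex instead of fully parsing it. A standalone equivalent over a hypothetical snippet:

    import re

    video_data = '{"title": "San Francisco police chief resigns", "seconds": "76"}'

    def get_json_value(key):
        m = re.search(r'"%s"\s*:\s*"([^"]+)"' % key, video_data)
        return m.group(1) if m else None

    print(get_json_value('title'))    # San Francisco police chief resigns
    print(get_json_value('seconds'))  # 76
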
index 99979ebe1a9fe82099076b46b576ef38a58bca8c..833d8a2f0d3813014224e39a8d2d41fb0e51d515 100644 (file)
@@ -13,8 +13,64 @@ from ..utils import (
 )
 
 
+class Revision3EmbedIE(InfoExtractor):
+    IE_NAME = 'revision3:embed'
+    _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)'
+    _TEST = {
+        'url': 'http://api.seekernetwork.com/player/embed?videoId=67558',
+        'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+        'info_dict': {
+            'id': '67558',
+            'ext': 'mp4',
+            'title': 'The Pros & Cons Of Zoos',
+            'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+            'uploader_id': 'dnews',
+            'uploader': 'DNews',
+        }
+    }
+    _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('playlist_id')
+        playlist_type = mobj.group('playlist_type') or 'video_id'
+        video_data = self._download_json(
+            'http://revision3.com/api/getPlaylist.json', playlist_id, query={
+                'api_key': self._API_KEY,
+                'codecs': 'h264,vp8,theora',
+                playlist_type: playlist_id,
+            })['items'][0]
+
+        formats = []
+        for vcodec, media in video_data['media'].items():
+            for quality_id, quality in media.items():
+                if quality_id == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        quality['url'], playlist_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False))
+                else:
+                    formats.append({
+                        'url': quality['url'],
+                        'format_id': '%s-%s' % (vcodec, quality_id),
+                        'tbr': int_or_none(quality.get('bitrate')),
+                        'vcodec': vcodec,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': playlist_id,
+            'title': unescapeHTML(video_data['title']),
+            'description': unescapeHTML(video_data.get('summary')),
+            'uploader': video_data.get('show', {}).get('name'),
+            'uploader_id': video_data.get('show', {}).get('slug'),
+            'duration': int_or_none(video_data.get('duration')),
+            'formats': formats,
+        }
+
+
 class Revision3IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
+    IE_NAME = 'revision'
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
     _TESTS = [{
         'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
         'md5': 'd94a72d85d0a829766de4deb8daaf7df',
@@ -32,52 +88,14 @@ class Revision3IE(InfoExtractor):
         }
     }, {
         # Show
-        'url': 'http://testtube.com/brainstuff',
-        'info_dict': {
-            'id': '251',
-            'title': 'BrainStuff',
-            'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.',
-        },
-        'playlist_mincount': 93,
-    }, {
-        'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
-        'info_dict': {
-            'id': '58227',
-            'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
-            'duration': 275,
-            'ext': 'webm',
-            'title': '5 Weird Ways Plants Can Eat Animals',
-            'description': 'Why have some plants evolved to eat meat?',
-            'upload_date': '20150120',
-            'timestamp': 1421763300,
-            'uploader': 'DNews',
-            'uploader_id': 'dnews',
-        },
-    }, {
-        'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
-        'info_dict': {
-            'id': '71618',
-            'ext': 'mp4',
-            'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
-            'title': 'The Israel-Palestine Conflict Explained in Ten Minutes',
-            'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start',
-            'uploader': 'Editors\' Picks',
-            'uploader_id': 'tt-editors-picks',
-            'timestamp': 1453309200,
-            'upload_date': '20160120',
-        },
-        'add_ie': ['Youtube'],
+        'url': 'http://revision3.com/variant',
+        'only_matching': True,
     }, {
         # Tag
-        'url': 'http://testtube.com/tech-news',
-        'info_dict': {
-            'id': '21018',
-            'title': 'tech news',
-        },
-        'playlist_mincount': 9,
+        'url': 'http://revision3.com/vr',
+        'only_matching': True,
     }]
     _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
-    _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
 
     def _real_extract(self, url):
         domain, display_id = re.match(self._VALID_URL, url).groups()
@@ -119,33 +137,9 @@ class Revision3IE(InfoExtractor):
                 })
                 return info
 
-            video_data = self._download_json(
-                'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
-                video_id)['items'][0]
-
-            formats = []
-            for vcodec, media in video_data['media'].items():
-                for quality_id, quality in media.items():
-                    if quality_id == 'hls':
-                        formats.extend(self._extract_m3u8_formats(
-                            quality['url'], video_id, 'mp4',
-                            'm3u8_native', m3u8_id='hls', fatal=False))
-                    else:
-                        formats.append({
-                            'url': quality['url'],
-                            'format_id': '%s-%s' % (vcodec, quality_id),
-                            'tbr': int_or_none(quality.get('bitrate')),
-                            'vcodec': vcodec,
-                        })
-            self._sort_formats(formats)
-
             info.update({
-                'title': unescapeHTML(video_data['title']),
-                'description': unescapeHTML(video_data.get('summary')),
-                'uploader': video_data.get('show', {}).get('name'),
-                'uploader_id': video_data.get('show', {}).get('slug'),
-                'duration': int_or_none(video_data.get('duration')),
-                'formats': formats,
+                '_type': 'url_transparent',
+                'url': 'revision3:%s' % video_id,
             })
             return info
         else:
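
Reviewer's note: Revision3EmbedIE reuses one getPlaylist endpoint for different lookups by making the query key itself dynamic (playlist_type: playlist_id), defaulting to video_id. A tiny illustration of the resulting query strings; the alternative key name and API key here are invented:

    from urllib.parse import urlencode  # compat_urllib_parse_urlencode in youtube-dl

    def build_query(playlist_id, playlist_type=None):
        return urlencode({
            'api_key': 'xxx',  # placeholder, not the real key
            'codecs': 'h264,vp8,theora',
            (playlist_type or 'video_id'): playlist_id,
        })

    print(build_query('67558'))             # ...&video_id=67558
    print(build_query('67558', 'page_id'))  # ...&page_id=67558
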
diff --git a/youtube_dl/extractor/rockstargames.py b/youtube_dl/extractor/rockstargames.py
new file mode 100644 (file)
index 0000000..48128e2
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class RockstarGamesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.rockstargames.com/videos/video/11544/',
+        'md5': '03b5caa6e357a4bd50e3143fc03e5733',
+        'info_dict': {
+            'id': '11544',
+            'ext': 'mp4',
+            'title': 'Further Adventures in Finance and Felony Trailer',
+            'description': 'md5:6d31f55f30cb101b5476c4a379e324a3',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'timestamp': 1464876000,
+            'upload_date': '20160602',
+        }
+    }, {
+        'url': 'http://www.rockstargames.com/videos#/?video=48',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://www.rockstargames.com/videoplayer/videos/get-video.json',
+            video_id, query={
+                'id': video_id,
+                'locale': 'en_us',
+            })['video']
+
+        title = video['title']
+
+        formats = []
+        # use a distinct loop variable so the outer video dict, which is
+        # read again below for youtube_id and metadata, is not shadowed
+        for video_file in video['files_processed']['video/mp4']:
+            if not video_file.get('src'):
+                continue
+            resolution = video_file.get('resolution')
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', resolution or '', 'height', default=None))
+            formats.append({
+                'url': self._proto_relative_url(video_file['src']),
+                'format_id': resolution,
+                'height': height,
+            })
+
+        if not formats:
+            youtube_id = video.get('youtube_id')
+            if youtube_id:
+                return self.url_result(youtube_id, 'Youtube')
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': self._proto_relative_url(video.get('screencap')),
+            'timestamp': parse_iso8601(video.get('created')),
+            'formats': formats,
+        }
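
Reviewer's note: the height is recovered from labels like '720p' with a strict anchored regex, so anything that is not '<digits>p' cleanly yields None. A quick check:

    import re

    def height_from_resolution(resolution):
        m = re.search(r'^(\d+)[pP]$', resolution or '')
        return int(m.group(1)) if m else None

    assert height_from_resolution('1080p') == 1080
    assert height_from_resolution('720P') == 720
    assert height_from_resolution('HD') is None
    assert height_from_resolution(None) is None
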
index 543d94417f6d52f9cb5f4bd4508d5bcf3d4b984e..4d612b5e3c1c1bfceec034d476cd5c0952da5b8c 100644 (file)
@@ -20,18 +20,19 @@ class RtlNlIE(InfoExtractor):
         (?P<id>[0-9a-f-]+)'''
 
     _TESTS = [{
-        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
-        'md5': 'cc16baa36a6c169391f0764fa6b16654',
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
+        'md5': '473d1946c1fdd050b2c0161a4b13c373',
         'info_dict': {
-            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
             'ext': 'mp4',
-            'title': 'RTL Nieuws - Laat',
-            'description': 'md5:6b61f66510c8889923b11f2778c72dc5',
-            'timestamp': 1408051800,
-            'upload_date': '20140814',
-            'duration': 576.880,
+            'title': 'RTL Nieuws',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'timestamp': 1461951000,
+            'upload_date': '20160429',
+            'duration': 1167.96,
         },
     }, {
+        # best format available a3t
         'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
         'md5': 'dea7474214af1271d91ef332fb8be7ea',
         'info_dict': {
@@ -39,18 +40,19 @@ class RtlNlIE(InfoExtractor):
             'ext': 'mp4',
             'timestamp': 1424039400,
             'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
-            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
             'upload_date': '20150215',
             'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
         }
     }, {
         # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+        # best format available nettv
         'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
         'info_dict': {
             'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
             'ext': 'mp4',
             'title': 'RTL Nieuws - Meer beelden van overval juwelier',
-            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
             'timestamp': 1437233400,
             'upload_date': '20150718',
             'duration': 30.474,
@@ -94,22 +96,46 @@ class RtlNlIE(InfoExtractor):
         videopath = material['videopath']
         m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
 
-        formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
 
         video_urlpart = videopath.split('/adaptive/')[1][:-5]
         PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
 
-        formats.extend([
-            {
-                'url': PG_URL_TEMPLATE % ('a2m', video_urlpart),
-                'format_id': 'pg-sd',
-            },
-            {
-                'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),
-                'format_id': 'pg-hd',
-                'quality': 0,
+        PG_FORMATS = (
+            ('a2t', 512, 288),
+            ('a3t', 704, 400),
+            ('nettv', 1280, 720),
+        )
+
+        def pg_format(format_id, width, height):
+            return {
+                'url': PG_URL_TEMPLATE % (format_id, video_urlpart),
+                'format_id': 'pg-%s' % format_id,
+                'protocol': 'http',
+                'width': width,
+                'height': height,
             }
-        ])
+
+        if not formats:
+            formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS]
+        else:
+            pg_formats = []
+            for format_id, width, height in PG_FORMATS:
+                try:
+                    # Find the hls format whose height matches this
+                    # progressive format and copy its metadata.
+                    f = next(f for f in formats if f.get('height') == height)
+                    # hls formats may have invalid width
+                    f['width'] = width
+                    f_copy = f.copy()
+                    f_copy.update(pg_format(format_id, width, height))
+                    pg_formats.append(f_copy)
+                except StopIteration:
+                    # Missing hls format does mean that no progressive format with
+                    # such width and height exists either.
+                    pass
+            formats.extend(pg_formats)
         self._sort_formats(formats)
 
         thumbnails = []
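
Reviewer's note: the progressive-format table above is used two ways: as a full fallback when the m3u8 probe failed, and otherwise only for renditions whose height also appeared in the HLS master (copying codec metadata across). A compact sketch of the pairing step:

    PG_FORMATS = (('a2t', 512, 288), ('a3t', 704, 400), ('nettv', 1280, 720))

    def pair_progressive(hls_formats):
        paired = []
        for format_id, width, height in PG_FORMATS:
            match = next((f for f in hls_formats if f.get('height') == height), None)
            if match is None:
                continue  # no hls twin -> assume no such progressive rendition
            paired.append(dict(match, format_id='pg-%s' % format_id,
                               protocol='http', width=width, height=height))
        return paired

    hls = [{'height': 400, 'vcodec': 'avc1.77.30', 'tbr': 1200}]
    print(pair_progressive(hls))  # only the a3t twin is emitted
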
index 79af477158630503078d86b117f960a36f5f1f73..f11e3588b0796718e2ecbe316b53b968a08df98c 100644 (file)
@@ -6,6 +6,9 @@ import re
 import time
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_struct_unpack,
+)
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -13,7 +16,6 @@ from ..utils import (
     remove_start,
     sanitized_Request,
     std_headers,
-    struct_unpack,
 )
 
 
@@ -21,7 +23,7 @@ def _decrypt_url(png):
     encrypted_data = base64.b64decode(png.encode('utf-8'))
     text_index = encrypted_data.find(b'tEXt')
     text_chunk = encrypted_data[text_index - 4:]
-    length = struct_unpack('!I', text_chunk[:4])[0]
+    length = compat_struct_unpack('!I', text_chunk[:4])[0]
     # Use bytearray to get integers when iterating in both python 2.x and 3.x
     data = bytearray(text_chunk[8:8 + length])
     data = [chr(b) for b in data if b != 0]
@@ -62,7 +64,7 @@ def _decrypt_url(png):
 class RTVEALaCartaIE(InfoExtractor):
     IE_NAME = 'rtve.es:alacarta'
     IE_DESC = 'RTVE a la carta'
-    _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
+    _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
@@ -85,6 +87,9 @@ class RTVEALaCartaIE(InfoExtractor):
     }, {
         'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
         'only_matching': True,
+    }, {
+        'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
+        'only_matching': True,
     }]
 
     def _real_initialize(self):
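
Reviewer's note on _decrypt_url: PNG chunks are length-prefixed, and the hunk above reads the 4-byte big-endian length that sits immediately before the 'tEXt' tag (compat_struct_unpack is youtube-dl's wrapper around struct.unpack). A minimal demonstration of that framing:

    import struct

    # fake chunk: 4-byte big-endian length, 'tEXt' tag, then the payload
    payload = b'hello'
    chunk = struct.pack('!I', len(payload)) + b'tEXt' + payload

    text_index = chunk.find(b'tEXt')
    text_chunk = chunk[text_index - 4:]           # back up to the length field
    length = struct.unpack('!I', text_chunk[:4])[0]
    assert text_chunk[8:8 + length] == payload    # 4 length bytes + 4 tag bytes
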
index 4896d09d666e687010ae3cb6ebe0e2bfaec537d6..f6454c6b0082ed431fa74de49dd5881d3b0b7a0f 100644 (file)
@@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.rtvnh.nl/video/131946',
-        'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+        'md5': 'cdbec9f44550763c8afc96050fa747dc',
         'info_dict': {
             'id': '131946',
             'ext': 'mp4',
@@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor):
             raise ExtractorError(
                 '%s returned error code %d' % (self.IE_NAME, status), expected=True)
 
-        formats = self._extract_smil_formats(
-            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
-
-        for item in meta['source']['fb']:
-            if item.get('type') == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
-            elif item.get('type') == '':
-                formats.append({'url': item['file']})
+        formats = []
+        rtmp_formats = self._extract_smil_formats(
+            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
+        formats.extend(rtmp_formats)
+
+        for rtmp_format in rtmp_formats:
+            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+            rtsp_format = rtmp_format.copy()
+            del rtsp_format['play_path']
+            del rtsp_format['ext']
+            rtsp_format.update({
+                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                'url': rtmp_url.replace('rtmp://', 'rtsp://'),
+                'protocol': 'rtsp',
+            })
+            formats.append(rtsp_format)
+            http_base_url = rtmp_url.replace('rtmp://', 'http://')
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
         self._sort_formats(formats)
 
         return {
index 759898a492f43c67179409c563be42e864deae5f..96e43af849bc9e6b90bbba71a38e7155ef864268 100644 (file)
@@ -1,18 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import itertools
 import json
-import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
-    js_to_json,
     mimetype2ext,
-    sanitized_Request,
-    unified_strdate,
 )
 
 
@@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Xyce Software Training - Section 1',
             'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
-            'upload_date': '20120904',
+            'upload_date': '20120409',
+            'timestamp': 1333983600,
             'duration': 7794,
         }
     }
@@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
-        webpage = self._download_webpage(req, video_id)
+        presentation_data = self._download_json(
+            'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
+            video_id, data=json.dumps({
+                'getPlayerOptionsRequest': {
+                    'ResourceId': video_id,
+                    'QueryString': '',
+                }
+            }), headers={
+                'Content-Type': 'application/json; charset=utf-8',
+            })['d']['Presentation']
 
-        js_path = self._search_regex(
-            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
-            webpage, 'JS code URL')
-        js_url = compat_urlparse.urljoin(url, js_path)
-
-        js_code = self._download_webpage(
-            js_url, video_id, note='Downloading player')
-
-        def extract_str(key, **args):
-            return self._search_regex(
-                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
-                js_code, key, **args)
-
-        def extract_data(key, **args):
-            data_json = extract_str(key, **args)
-            if data_json is None:
-                return data_json
-            return self._parse_json(
-                data_json, video_id, transform_source=js_to_json)
+        title = presentation_data['Title']
 
         formats = []
-        for i in itertools.count():
-            fd = extract_data('VideoUrls[%d]' % i, default=None)
-            if fd is None:
-                break
-            formats.append({
-                'format_id': '%s' % i,
-                'format_note': fd['MimeType'].partition('/')[2],
-                'ext': mimetype2ext(fd['MimeType']),
-                'url': fd['Location'],
-                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
-            })
+        for stream in presentation_data.get('Streams', []):
+            for fd in stream.get('VideoUrls', []):
+                formats.append({
+                    'format_id': fd['MediaType'],
+                    'format_note': fd['MimeType'].partition('/')[2],
+                    'ext': mimetype2ext(fd['MimeType']),
+                    'url': fd['Location'],
+                    'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+                })
         self._sort_formats(formats)
 
-        slide_baseurl = compat_urlparse.urljoin(
-            url, extract_data('SlideBaseUrl'))
-        slide_template = slide_baseurl + re.sub(
-            r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
-        slides = []
-        last_slide_time = 0
-        for i in itertools.count(1):
-            sd = extract_str('Slides[%d]' % i, default=None)
-            if sd is None:
-                break
-            timestamp = int_or_none(self._search_regex(
-                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
-                sd, 'slide %s timestamp' % i, fatal=False))
-            slides.append({
-                'url': slide_template % i,
-                'duration': timestamp - last_slide_time,
-            })
-            last_slide_time = timestamp
-        formats.append({
-            'format_id': 'slides',
-            'protocol': 'slideshow',
-            'url': json.dumps(slides),
-            'preference': -10000,  # Downloader not yet written
-        })
-        self._sort_formats(formats)
-
-        title = extract_data('Title')
-        description = extract_data('Description', fatal=False)
-        duration = int_or_none(extract_data(
-            'Duration', fatal=False), scale=1000)
-        upload_date = unified_strdate(extract_data('AirDate', fatal=False))
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': presentation_data.get('Description'),
             'formats': formats,
-            'upload_date': upload_date,
-            'duration': duration,
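+            # UnixTime and Duration are given in milliseconds; scale=1000
+            # converts them to seconds.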
+            'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000),
+            'duration': int_or_none(presentation_data.get('Duration'), 1000),
         }
index 3bf93c870b2bc30c3baf9567a64d06171558f06b..b1ca12fdee1c012de789ebfaf15f03f04e73f768 100644 (file)
@@ -18,6 +18,7 @@ class SciVeeIE(InfoExtractor):
             'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting',
             'description': 'md5:81f1710638e11a481358fab1b11059d7',
         },
+        'skip': 'Not accessible from Travis CI server',
     }
 
     def _real_extract(self, url):
index 44b0bbee68953a199c67e420fe1928048be5f2cf..40333c825443f1b349e8e40c7a08faff7f261e48 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 
 class ScreenwaveMediaIE(InfoExtractor):
-    _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
+    _VALID_URL = r'(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
     EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1'
     _TESTS = [{
         'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py
new file mode 100644 (file)
index 0000000..3b9c65e
--- /dev/null
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SeekerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
+    _TESTS = [{
+        # player.loadRevision3Item
+        'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
+        'md5': '30c1dc4030cc715cf05b423d0947ac18',
+        'info_dict': {
+            'id': '76243',
+            'ext': 'webm',
+            'title': 'Should Trump Be Required To Release His Tax Returns?',
+            'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?',
+            'uploader': 'Seeker Daily',
+            'uploader_id': 'seekerdaily',
+        }
+    }, {
+        'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
+        'playlist': [
+            {
+                'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+                'info_dict': {
+                    'id': '67558',
+                    'ext': 'mp4',
+                    'title': 'The Pros & Cons Of Zoos',
+                    'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+                    'uploader': 'DNews',
+                    'uploader_id': 'dnews',
+                },
+            }
+        ],
+        'info_dict': {
+            'id': '1834116536',
+            'title': 'After Gorilla Killing, Changes Ahead for Zoos',
+            'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id, article_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
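+        # Pages embed videos either via a player.loadRevision3Item() call or
+        # via api.seekernetwork.com iframes; try the single-video case first.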
+        mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage)
+        if mobj:
+            playlist_type, playlist_id = mobj.groups()
+            return self.url_result(
+                'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id)
+        else:
+            entries = [
+                self.url_result(
+                    'revision3:video_id:%s' % video_id, 'Revision3Embed', video_id)
+                for video_id in re.findall(
+                    r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)',
+                    webpage)]
+            return self.playlist_result(
+                entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py
new file mode 100644 (file)
index 0000000..1c636f6
--- /dev/null
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .jwplatform import JWPlatformBaseIE
+from ..compat import compat_parse_qs
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
+
+
+class SendtoNewsIE(JWPlatformBaseIE):
+    _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)'
+
+    _TEST = {
+        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
+        'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes',
+        'info_dict': {
+            'id': 'GxfCe0Zo7D-175909-5588',
+            'ext': 'mp4',
+            'title': 'Recap: CLE 15, CIN 6',
+            'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+            'duration': 49,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s'
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
+            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
+                .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
+            \1>''', webpage)
+        if mobj:
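+            # The SC parameter packs the SK, MK and PK values joined by dashes.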
+            sk, mk, pk = mobj.group('SC').split('-')
+            return cls._URL_TEMPLATE % (sk, mk, pk)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        params = compat_parse_qs(mobj.group('query'))
+
+        if 'SK' not in params or 'MK' not in params or 'PK' not in params:
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]])
+
+        webpage = self._download_webpage(url, video_id)
+
+        jwplayer_data_str = self._search_regex(
+            r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data')
+        js_vars = {
+            'w': 1024,
+            'h': 768,
+            'modeVar': 'html5',
+        }
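+        # The setup() argument is JavaScript rather than JSON and references
+        # page-level variables (w, h, modeVar); substitute literal values so
+        # it can be parsed as JSON below.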
+        for name, val in js_vars.items():
+            js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
+            jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)
+
+        info_dict = self._parse_jwplayer_data(
+            self._parse_json(jwplayer_data_str, video_id),
+            video_id, require_title=False, rtmp_params={'no_resume': True})
+
+        title = self._html_search_regex(
+            r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title')
+        description = self._html_search_regex(
+            r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage,
+            'description', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'<div[^>]+class="embedDetails">([0-9:]+)', webpage,
+            'duration', fatal=False))
+
+        info_dict.update({
+            'title': title,
+            'description': description,
+            'duration': duration,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py
deleted file mode 100644 (file)
index e334836..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    unified_strdate,
-    parse_duration,
-    int_or_none,
-)
-
-
-class SexyKarmaIE(InfoExtractor):
-    IE_DESC = 'Sexy Karma and Watch Indian Porn'
-    _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
-    _TESTS = [{
-        'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html',
-        'md5': 'b9798e7d1ef1765116a8f516c8091dbd',
-        'info_dict': {
-            'id': 'yHI70cOyIHt',
-            'display_id': 'taking-a-quick-pee',
-            'ext': 'mp4',
-            'title': 'Taking a quick pee.',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'uploader': 'wildginger7',
-            'upload_date': '20141008',
-            'duration': 22,
-            'view_count': int,
-            'comment_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }, {
-        'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
-        'md5': 'dd216c68d29b49b12842b9babe762a5d',
-        'info_dict': {
-            'id': '8Id6EZPbuHf',
-            'display_id': 'pot-pixie-tribute',
-            'ext': 'mp4',
-            'title': 'pot_pixie tribute',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'uploader': 'banffite',
-            'upload_date': '20141013',
-            'duration': 16,
-            'view_count': int,
-            'comment_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }, {
-        'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html',
-        'md5': '9afb80675550406ed9a63ac2819ef69d',
-        'info_dict': {
-            'id': 'dW2mtctxJfs',
-            'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number',
-            'ext': 'mp4',
-            'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'uploader': 'Don',
-            'upload_date': '20140213',
-            'duration': 83,
-            'view_count': int,
-            'comment_count': int,
-            'categories': list,
-            'age_limit': 18,
-        }
-    }]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id)
-
-        video_url = self._html_search_regex(
-            r"url: escape\('([^']+)'\)", webpage, 'url')
-
-        title = self._html_search_regex(
-            r'<h2 class="he2"><span>(.*?)</span>',
-            webpage, 'title')
-        thumbnail = self._html_search_regex(
-            r'<span id="container"><img\s+src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
-
-        uploader = self._html_search_regex(
-            r'class="aupa">\s*(.*?)</a>',
-            webpage, 'uploader')
-        upload_date = unified_strdate(self._html_search_regex(
-            r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False))
-
-        duration = parse_duration(self._search_regex(
-            r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>',
-            webpage, 'duration', fatal=False))
-
-        view_count = int_or_none(self._search_regex(
-            r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
-            webpage, 'view count', fatal=False))
-        comment_count = int_or_none(self._search_regex(
-            r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
-            webpage, 'comment count', fatal=False))
-
-        categories = re.findall(
-            r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>',
-            webpage)
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'url': video_url,
-            'title': title,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'duration': duration,
-            'view_count': view_count,
-            'comment_count': comment_count,
-            'categories': categories,
-            'age_limit': 18,
-        }
index d03f1b1d4308d047e5b690a682587ac5655ce338..8fc66732af70f4db5305fdc891c5142afd5c97c7 100644 (file)
@@ -4,28 +4,35 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-from ..utils import sanitized_Request
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+    int_or_none,
+    update_url_query,
+    qualities,
+    get_element_by_attribute,
+    clean_html,
+)
 
 
 class SinaIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/
-                        (
-                            (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-))))
-                            |
+    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
+                        (?:
+                            (?:view/|.*\#)(?P<video_id>\d+)|
+                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
                             # This is used by external sites like Weibo
-                            (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+                            api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
                         )
                   '''
 
     _TESTS = [
         {
-            'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
-            'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f',
+            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
+            'md5': 'd38433e2fc886007729735650ae4b3e9',
             'info_dict': {
-                'id': '110028898',
-                'ext': 'flv',
-                'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+                'id': '250576622',
+                'ext': 'mp4',
+                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
             }
         },
         {
@@ -35,37 +42,74 @@ class SinaIE(InfoExtractor):
                 'ext': 'flv',
                 'title': '军方提高对朝情报监视级别',
             },
+            'skip': 'the page does not exist or has been deleted',
+        },
+        {
+            'url': 'http://video.sina.com.cn/view/250587748.html',
+            'md5': '3d1807a25c775092aab3bc157fff49b4',
+            'info_dict': {
+                'id': '250587748',
+                'ext': 'mp4',
+                'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光',
+            },
         },
     ]
 
-    def _extract_video(self, video_id):
-        data = compat_urllib_parse_urlencode({'vid': video_id})
-        url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
-                                     video_id, 'Downloading video url')
-        image_page = self._download_webpage(
-            'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
-            video_id, 'Downloading thumbnail info')
-
-        return {'id': video_id,
-                'url': url_doc.find('./durl/url').text,
-                'ext': 'flv',
-                'title': url_doc.find('./vname').text,
-                'thumbnail': image_page.split('=')[1],
-                }
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        if mobj.group('token') is not None:
-            # The video id is in the redirected url
-            self.to_screen('Getting video id')
-            request = sanitized_Request(url)
-            request.get_method = lambda: 'HEAD'
-            (_, urlh) = self._download_webpage_handle(request, 'NA', False)
-            return self._real_extract(urlh.geturl())
-        elif video_id is None:
-            pseudo_id = mobj.group('pseudo_id')
-            webpage = self._download_webpage(url, pseudo_id)
-            video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id')
 
-        return self._extract_video(video_id)
+        video_id = mobj.group('video_id')
+        if not video_id:
+            if mobj.group('token') is not None:
+                # The video id is in the redirected URL
+                self.to_screen('Getting video id')
+                request = HEADRequest(url)
+                (_, urlh) = self._download_webpage_handle(request, 'NA', False)
+                return self._real_extract(urlh.geturl())
+            else:
+                pseudo_id = mobj.group('pseudo_id')
+                webpage = self._download_webpage(url, pseudo_id)
+                error = get_element_by_attribute('class', 'errtitle', webpage)
+                if error:
+                    raise ExtractorError('%s said: %s' % (
+                        self.IE_NAME, clean_html(error)), expected=True)
+                video_id = self._search_regex(
+                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
+
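+        # The h5play endpoint responds with {code, message, data}; any code
+        # other than 1 indicates an error.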
+        video_data = self._download_json(
+            'http://s.video.sina.com.cn/video/h5play',
+            video_id, query={'video_id': video_id})
+        if video_data['code'] != 1:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, video_data['message']), expected=True)
+        else:
+            video_data = video_data['data']
+            title = video_data['title']
+            description = video_data.get('description')
+            if description:
+                description = description.strip()
+
+            preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
+            formats = []
+            for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
+                file_api = quality.get('file_api')
+                file_id = quality.get('file_id')
+                if not file_api or not file_id:
+                    continue
+                formats.append({
+                    'format_id': quality_id,
+                    'url': update_url_query(file_api, {'vid': file_id}),
+                    'preference': preference(quality_id),
+                    'ext': 'mp4',
+                })
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'thumbnail': video_data.get('image'),
+                'duration': int_or_none(video_data.get('length')),
+                'timestamp': int_or_none(video_data.get('create_time')),
+                'formats': formats,
+            }
diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py
new file mode 100644 (file)
index 0000000..d3aba58
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    int_or_none,
+    mimetype2ext,
+    determine_ext,
+)
+
+
+class SixPlayIE(InfoExtractor):
+    _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320',
+        'md5': '42310bffe4ba3982db112b9cd3467328',
+        'info_dict': {
+            'id': '11495320',
+            'ext': 'mp4',
+            'title': 'Jamel et ses amis au Marrakech du rire 2015',
+            'description': 'md5:ba2149d5c321d5201b78070ee839d872',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        clip_data = self._download_json(
+            'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id,
+            video_id)
+        video_data = clip_data['videoInfo']
+
+        quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
+        formats = []
+        for source in clip_data['sources']:
+            source_type, source_url = source.get('type'), source.get('src')
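+            # 'hls/primetime' sources appear to be DRM-protected (Adobe
+            # Primetime) and are skipped.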
+            if not source_url or source_type == 'hls/primetime':
+                continue
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_f4m_formats(
+                    source_url.replace('.m3u8', '.f4m'),
+                    video_id, f4m_id='hds', fatal=False))
+            elif ext == 'mp4':
+                quality = source.get('quality')
+                formats.append({
+                    'url': source_url,
+                    'format_id': quality,
+                    'quality': quality_key(quality),
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_data['title'].strip(),
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'series': video_data.get('titlePgm'),
+            'formats': formats,
+        }
index 05e1b02ada567c8d88c0dbbba41171d0df1ac680..fffc9aa2277e3a0f35d24c585eec5be1e59d5c17 100644 (file)
@@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):
 
 
 class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE):
-    IE_NAME = 'skynewsarabia:video'
+    IE_NAME = 'skynewsarabia:article'
     _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9',
diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py
new file mode 100644 (file)
index 0000000..9dc78c7
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SkySportsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
+        'md5': 'c44a1db29f27daf9a0003e010af82100',
+        'info_dict': {
+            'id': '10328419',
+            'ext': 'flv',
+            'title': 'Bale: Its our time to shine',
+            'description': 'md5:9fd1de3614d525f5addda32ac3c482c9',
+        },
+        'add_ie': ['Ooyala'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
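+        # Hand off to the Ooyala extractor while keeping the title and
+        # description scraped from this page (url_transparent).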
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'ooyala:%s' % self._search_regex(
+                r'data-video-id="([^"]+)"', webpage, 'ooyala id'),
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'ie_key': 'Ooyala',
+        }
index 0b717a1e42b8dd2c3d8a88d602f001876cf99e03..4967c1b7752e4ebfd0c1aac9b0d079c2dc843363 100644 (file)
@@ -9,6 +9,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    get_element_by_id,
 )
 
 
@@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor):
         bucket = info['jsplayer']['video_bucket']
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
-        description = self._html_search_regex(
+        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
             r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
             'description', fatal=False)
 
@@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor):
             'ext': ext,
             'url': video_url,
             'thumbnail': info['slideshow']['pin_image_url'],
-            'description': description,
+            'description': description.strip() if description else None,
         }
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
deleted file mode 100644 (file)
index 6977afb..0000000
+++ /dev/null
@@ -1,181 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    clean_html,
-    determine_ext,
-    int_or_none,
-    js_to_json,
-    parse_duration,
-)
-
-
-class SnagFilmsEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
-    _TESTS = [{
-        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
-        'md5': '2924e9215c6eff7a55ed35b72276bd93',
-        'info_dict': {
-            'id': '74849a00-85a9-11e1-9660-123139220831',
-            'ext': 'mp4',
-            'title': '#whilewewatch',
-        }
-    }, {
-        # invalid labels, 360p is better than 480p
-        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
-        'md5': '882fca19b9eb27ef865efeeaed376a48',
-        'info_dict': {
-            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
-            'ext': 'mp4',
-            'title': 'Life in Limbo',
-        }
-    }, {
-        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
-        'only_matching': True,
-    }]
-
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        if '>This film is not playable in your area.<' in webpage:
-            raise ExtractorError(
-                'Film %s is not playable in your area.' % video_id, expected=True)
-
-        formats = []
-        for source in self._parse_json(js_to_json(self._search_regex(
-                r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
-            file_ = source.get('file')
-            if not file_:
-                continue
-            type_ = source.get('type')
-            ext = determine_ext(file_)
-            format_id = source.get('label') or ext
-            if all(v == 'm3u8' for v in (type_, ext)):
-                formats.extend(self._extract_m3u8_formats(
-                    file_, video_id, 'mp4', m3u8_id='hls'))
-            else:
-                bitrate = int_or_none(self._search_regex(
-                    [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
-                    file_, 'bitrate', default=None))
-                height = int_or_none(self._search_regex(
-                    r'^(\d+)[pP]$', format_id, 'height', default=None))
-                formats.append({
-                    'url': file_,
-                    'format_id': format_id,
-                    'tbr': bitrate,
-                    'height': height,
-                })
-        self._sort_formats(formats)
-
-        title = self._search_regex(
-            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
-            webpage, 'title')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-        }
-
-
-class SnagFilmsIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
-    _TESTS = [{
-        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
-        'md5': '19844f897b35af219773fd63bdec2942',
-        'info_dict': {
-            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
-            'display_id': 'lost_for_life',
-            'ext': 'mp4',
-            'title': 'Lost for Life',
-            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
-            'thumbnail': 're:^https?://.*\.jpg',
-            'duration': 4489,
-            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
-        }
-    }, {
-        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
-        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
-        'info_dict': {
-            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
-            'display_id': 'the_world_cut_project/india',
-            'ext': 'mp4',
-            'title': 'India',
-            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
-            'thumbnail': 're:^https?://.*\.jpg',
-            'duration': 979,
-            'categories': ['Documentary', 'Sports', 'Politics']
-        }
-    }, {
-        # Film is not playable in your area.
-        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
-        'only_matching': True,
-    }, {
-        # Film is not available.
-        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, display_id)
-
-        if ">Sorry, the Film you're looking for is not available.<" in webpage:
-            raise ExtractorError(
-                'Film %s is not available.' % display_id, expected=True)
-
-        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
-
-        snag = self._parse_json(
-            self._search_regex(
-                'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
-            display_id)
-
-        for item in snag:
-            if item.get('data', {}).get('film', {}).get('id') == film_id:
-                data = item['data']['film']
-                title = data['title']
-                description = clean_html(data.get('synopsis'))
-                thumbnail = data.get('image')
-                duration = int_or_none(data.get('duration') or data.get('runtime'))
-                categories = [
-                    category['title'] for category in data.get('categories', [])
-                    if category.get('title')]
-                break
-        else:
-            title = self._search_regex(
-                r'itemprop="title">([^<]+)<', webpage, 'title')
-            description = self._html_search_regex(
-                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
-                webpage, 'description', default=None) or self._og_search_description(webpage)
-            thumbnail = self._og_search_thumbnail(webpage)
-            duration = parse_duration(self._search_regex(
-                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
-                webpage, 'duration', fatal=False))
-            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
-
-        return {
-            '_type': 'url_transparent',
-            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
-            'id': film_id,
-            'display_id': display_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'categories': categories,
-        }
index 49e5d09ae450d11bb567a2fe95ecba55998c8b42..72fe66142a4e6c7ab54dbf8f03de5399fac02eeb 100644 (file)
@@ -8,10 +8,7 @@ from ..compat import (
     compat_str,
     compat_urllib_parse_urlencode,
 )
-from ..utils import (
-    ExtractorError,
-    sanitized_Request,
-)
+from ..utils import ExtractorError
 
 
 class SohuIE(InfoExtractor):
@@ -96,15 +93,10 @@ class SohuIE(InfoExtractor):
             else:
                 base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
 
-            req = sanitized_Request(base_data_url + vid_id)
-
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
             return self._download_json(
-                req, video_id,
-                'Downloading JSON data for %s' % vid_id)
+                base_data_url + vid_id, video_id,
+                'Downloading JSON data for %s' % vid_id,
+                headers=self.geo_verification_headers())
 
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
index 692fd78e886c0a6a932adce4659f2564beeab7e6..92a7120a3242e732ceb58f51b4391a5efbc569d8 100644 (file)
@@ -96,20 +96,18 @@ class SpankwireIE(InfoExtractor):
         formats = []
         for height, video_url in zip(heights, video_urls):
             path = compat_urllib_parse_urlparse(video_url).path
-            _, quality = path.split('/')[4].split('_')[:2]
-            f = {
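+            # Format paths look like .../<height>P_<bitrate>K_...; recover
+            # height and total bitrate from them when present.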
+            m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path)
+            if m:
+                tbr = int(m.group('tbr'))
+                height = int(m.group('height'))
+            else:
+                tbr = None
+            formats.append({
                 'url': video_url,
+                'format_id': '%dp' % height,
                 'height': height,
-            }
-            tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None)
-            if tbr:
-                f.update({
-                    'tbr': int(tbr),
-                    'format_id': '%dp' % height,
-                })
-            else:
-                f['format_id'] = quality
-            formats.append(f)
+                'tbr': tbr,
+            })
         self._sort_formats(formats)
 
         age_limit = self._rta_search(webpage)
index 39a7aaf9d630203dc1796b3b5621aad3c433f575..3c552807e268bb50a6a7d178e61d0834b0c48a42 100644 (file)
@@ -4,8 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from .spiegeltv import SpiegeltvIE
+from ..compat import compat_urlparse
+from ..utils import (
+    extract_attributes,
+    unified_strdate,
+    get_element_by_attribute,
+)
 
 
 class SpiegelIE(InfoExtractor):
@@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor):
             'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
             'description': 'md5:8029d8310232196eb235d27575a8b9f4',
             'duration': 49,
+            'upload_date': '20130311',
         },
     }, {
         'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
@@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor):
             'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
             'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
             'duration': 983,
+            'upload_date': '20131115',
         },
     }, {
         'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
@@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
             'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
+            'upload_date': '20140904',
         }
     }, {
         'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
@@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor):
         if SpiegeltvIE.suitable(handle.geturl()):
             return self.url_result(handle.geturl(), 'Spiegeltv')
 
-        title = re.sub(r'\s+', ' ', self._html_search_regex(
-            r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>',
-            webpage, 'title'))
-        description = self._html_search_meta('description', webpage, 'description')
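+        # Title, teaser and date are exposed as data attributes on the
+        # #spVideoElements container.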
+        video_data = extract_attributes(self._search_regex(
+            r'(<div[^>]+id="spVideoElements"[^>]+>)',
+            webpage, 'video element', default=''))
+
+        title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage)
+        description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description')
 
         base_url = self._search_regex(
             [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'],
@@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': description.strip() if description else None,
             'duration': duration,
+            'upload_date': unified_strdate(video_data.get('data-video-date')),
             'formats': formats,
         }
 
@@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
             'description': 're:^Patrick Kämnitz gehört.{100,}',
+            'upload_date': '20140825',
         },
     }, {
         'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py
new file mode 100644 (file)
index 0000000..0d7925a
--- /dev/null
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .wdr import WDRBaseIE
+from ..utils import get_element_by_attribute
+
+
+class SportschauIE(WDRBaseIE):
+    IE_NAME = 'Sportschau'
+    _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html'
+    _TEST = {
+        'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html',
+        'info_dict': {
+            'id': 'mdb-1140188',
+            'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
+            'ext': 'mp4',
+            'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
+            'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
+            'upload_date': '20160615',
+        },
+        'skip': 'Geo-restricted to Germany',
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = get_element_by_attribute('class', 'headline', webpage)
+        description = self._html_search_meta('description', webpage, 'description')
+
+        info = self._extract_wdr_video(webpage, video_id)
+
+        info.update({
+            'title': title,
+            'description': description,
+        })
+
+        return info
index 74d01183f5f396fb9499a8426775886faed5961d..409d5030422652e26fff1102c7fee1302f2b07b9 100644 (file)
@@ -9,8 +9,9 @@ from ..utils import (
 
 
 class SRMediathekIE(ARDMediathekIE):
+    IE_NAME = 'sr:mediathek'
     IE_DESC = 'Saarländischer Rundfunk'
-    _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
 
     _TESTS = [{
         'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
@@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE):
             # m3u8 download
             'skip_download': True,
         },
-        'expected_warnings': ['Unable to download f4m manifest']
+    }, {
+        'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index d5c852f5207bdad9510a720d69b0cd70527f9f3f..0f8782d038c9fdadf903b05479ff468a039c6aa4 100644 (file)
@@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor):
 
         episode = self._parse_json(
             js_to_json(self._search_regex(
-                r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
+                r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
             display_id)['config']['episode']
 
         title = unescapeHTML(episode['title'])
index 712359885fde90fa3032aeff1b2cb74afb761f35..6a6bb90c493a92fc2e644e2a550547460d899ca4 100644 (file)
@@ -5,7 +5,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    sanitized_Request,
+    ExtractorError,
     urlencode_postdata,
 )
 
@@ -14,7 +14,7 @@ class StreamcloudIE(InfoExtractor):
     IE_NAME = 'streamcloud.eu'
     _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
         'md5': '6bea4c7fa5daaacc2a946b7146286686',
         'info_dict': {
@@ -23,7 +23,10 @@ class StreamcloudIE(InfoExtractor):
             'title': 'youtube-dl test video  \'/\\ ä ↭',
         },
         'skip': 'Only available from the EU'
-    }
+    }, {
+        'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -31,26 +34,36 @@ class StreamcloudIE(InfoExtractor):
 
         orig_webpage = self._download_webpage(url, video_id)
 
+        if '>File Not Found<' in orig_webpage:
+            raise ExtractorError(
+                'Video %s does not exist' % video_id, expected=True)
+
         fields = re.findall(r'''(?x)<input\s+
             type="(?:hidden|submit)"\s+
             name="([^"]+)"\s+
             (?:id="[^"]+"\s+)?
             value="([^"]*)"
             ''', orig_webpage)
-        post = urlencode_postdata(fields)
 
         self._sleep(12, video_id)
-        headers = {
-            b'Content-Type': b'application/x-www-form-urlencoded',
-        }
-        req = sanitized_Request(url, post, headers)
 
         webpage = self._download_webpage(
-            req, video_id, note='Downloading video page ...')
-        title = self._html_search_regex(
-            r'<h1[^>]*>([^<]+)<', webpage, 'title')
-        video_url = self._search_regex(
-            r'file:\s*"([^"]+)"', webpage, 'video URL')
+            url, video_id, data=urlencode_postdata(fields), headers={
+                b'Content-Type': b'application/x-www-form-urlencoded',
+            })
+
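+        # If the expected player fields are missing, surface the page's own
+        # msgboxinfo error message instead of failing with a generic error.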
+        try:
+            title = self._html_search_regex(
+                r'<h1[^>]*>([^<]+)<', webpage, 'title')
+            video_url = self._search_regex(
+                r'file:\s*"([^"]+)"', webpage, 'video URL')
+        except ExtractorError:
+            message = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>',
+                webpage, 'message', default=None, group='message')
+            if message:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+            raise
         thumbnail = self._search_regex(
             r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False)
 
index 2ab30e45ff7c65ab7dd1d6cff7a1952764799cc0..1c04dfb7bf757477d134cc7caa223ab47d0800ba 100644 (file)
@@ -6,17 +6,14 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
+    dict_get,
+    int_or_none,
+    try_get,
 )
 
 
 class SVTBaseIE(InfoExtractor):
-    def _extract_video(self, url, video_id):
-        info = self._download_json(url, video_id)
-
-        title = info['context']['title']
-        thumbnail = info['context'].get('thumbnailImage')
-
-        video_info = info['video']
+    def _extract_video(self, video_info, video_id):
         formats = []
         for vr in video_info['videoReferences']:
             player_type = vr.get('playerType')
@@ -40,27 +37,49 @@ class SVTBaseIE(InfoExtractor):
                     'format_id': player_type,
                     'url': vurl,
                 })
+        if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
+            self.raise_geo_restricted('This video is only available in Sweden')
         self._sort_formats(formats)
 
         subtitles = {}
-        subtitle_references = video_info.get('subtitleReferences')
+        subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
         if isinstance(subtitle_references, list):
             for sr in subtitle_references:
                 subtitle_url = sr.get('url')
+                subtitle_lang = sr.get('language', 'sv')
                 if subtitle_url:
-                    subtitles.setdefault('sv', []).append({'url': subtitle_url})
+                    if determine_ext(subtitle_url) == 'm3u8':
+                        # TODO(yan12125): handle WebVTT in m3u8 manifests
+                        continue
+
+                    subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
 
-        duration = video_info.get('materialLength')
-        age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+        title = video_info.get('title')
+
+        series = video_info.get('programTitle')
+        season_number = int_or_none(video_info.get('season'))
+        episode = video_info.get('episodeTitle')
+        episode_number = int_or_none(video_info.get('episodeNumber'))
+
+        duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
+        age_limit = None
+        adult = dict_get(
+            video_info, ('inappropriateForChildren', 'blockedForChildren'),
+            skip_false_values=False)
+        if adult is not None:
+            age_limit = 18 if adult else 0
 
         return {
             'id': video_id,
             'title': title,
             'formats': formats,
             'subtitles': subtitles,
-            'thumbnail': thumbnail,
             'duration': duration,
             'age_limit': age_limit,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
         }
 
 
@@ -68,11 +87,11 @@ class SVTIE(SVTBaseIE):
     _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
-        'md5': '9648197555fc1b49e3dc22db4af51d46',
+        'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
         'info_dict': {
             'id': '2900353',
-            'ext': 'flv',
-            'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+            'ext': 'mp4',
+            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
             'duration': 27,
             'age_limit': 0,
         },
@@ -89,15 +108,20 @@ class SVTIE(SVTBaseIE):
         mobj = re.match(self._VALID_URL, url)
         widget_id = mobj.group('widget_id')
         article_id = mobj.group('id')
-        return self._extract_video(
+
+        info = self._download_json(
             'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
             article_id)
 
+        info_dict = self._extract_video(info['video'], article_id)
+        info_dict['title'] = info['context']['title']
+        return info_dict
+
 
 class SVTPlayIE(SVTBaseIE):
     IE_DESC = 'SVT Play and Öppet arkiv'
-    _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)'
+    _TESTS = [{
         'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
         'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
         'info_dict': {
@@ -113,12 +137,50 @@ class SVTPlayIE(SVTBaseIE):
                 }]
             },
         },
-    }
+    }, {
+        # geo restricted to Sweden
+        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        host = mobj.group('host')
-        return self._extract_video(
-            'http://www.%s.se/video/%s?output=json' % (host, video_id),
-            video_id)
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                r'root\["__svtplay"\]\s*=\s*([^;]+);',
+                webpage, 'embedded data', default='{}'),
+            video_id, fatal=False)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        if data:
+            video_info = try_get(
+                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
+                dict)
+            if video_info:
+                info_dict = self._extract_video(video_info, video_id)
+                info_dict.update({
+                    'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
+                    'thumbnail': thumbnail,
+                })
+                return info_dict
+
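+        # Fall back to the <video> element's data-video-id and fetch metadata
+        # from the videoplayer API.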
+        video_id = self._search_regex(
+            r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+            webpage, 'video id', default=None)
+
+        if video_id:
+            data = self._download_json(
+                'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+            info_dict = self._extract_video(data, video_id)
+            if not info_dict.get('title'):
+                info_dict['title'] = re.sub(
+                    r'\s*\|\s*.+?$', '',
+                    info_dict.get('episode') or self._og_search_title(webpage))
+            return info_dict
index 73e7657d4bec7b1bc37753923744d92b769d8843..136e18f96cadf7bd5701e32b0a3bc7c8767e324e 100644 (file)
@@ -4,42 +4,178 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import parse_filesize
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    parse_iso8601,
+    parse_filesize,
+)
+
+
+class TagesschauPlayerIE(InfoExtractor):
+    IE_NAME = 'tagesschau:player'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
+        'md5': '8d09548d5c15debad38bee3a4d15ca21',
+        'info_dict': {
+            'id': '179517',
+            'ext': 'mp4',
+            'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
+            'thumbnail': 're:^https?:.*\.jpg$',
+            'formats': 'mincount:6',
+        },
+    }, {
+        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+        'info_dict': {
+            'id': '29417',
+            'ext': 'mp3',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'thumbnail': 're:^https?:.*\.jpg$',
+            'formats': 'mincount:2',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
+        'only_matching': True,
+    }]
+
+    _FORMATS = {
+        'xs': {'quality': 0},
+        's': {'width': 320, 'height': 180, 'quality': 1},
+        'm': {'width': 512, 'height': 288, 'quality': 2},
+        'l': {'width': 960, 'height': 540, 'quality': 3},
+        'xl': {'width': 1280, 'height': 720, 'quality': 4},
+        'xxl': {'quality': 5},
+    }
+
+    def _extract_via_api(self, kind, video_id):
+        info = self._download_json(
+            'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
+            video_id)
+        title = info['headline']
+        formats = []
+        for media in info['mediadata']:
+            for format_id, format_url in media.items():
+                if determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls'))
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'vcodec': 'none' if kind == 'audio' else None,
+                    })
+        self._sort_formats(formats)
+        timestamp = parse_iso8601(info.get('date'))
+        return {
+            'id': video_id,
+            'title': title,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # kind = mobj.group('kind').lower()
+        # if kind == 'video':
+        #     return self._extract_via_api(kind, video_id)
+
+        # The JSON API does not provide some audio formats (e.g. ogg),
+        # thus extracting audio via the webpage
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage).strip()
+        formats = []
+
+        for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
+            media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
+            if not media:
+                continue
+            src = media.get('src')
+            if not src:
+                continue
+            quality = media.get('quality')
+            kind = media.get('type', '').split('/')[0]
+            ext = determine_ext(src)
+            f = {
+                'url': src,
+                'format_id': '%s_%s' % (quality, ext) if quality else ext,
+                'ext': ext,
+                'vcodec': 'none' if kind == 'audio' else None,
+            }
+            f.update(self._FORMATS.get(quality, {}))
+            formats.append(f)
+
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
 
 
 class TagesschauIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
 
     _TESTS = [{
         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
-        'md5': '917a228bc7df7850783bc47979673a09',
+        'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
         'info_dict': {
-            'id': '102143',
+            'id': 'video-102143',
             'ext': 'mp4',
             'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
-            'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+            'description': '18.07.2015 20:10 Uhr',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
     }, {
         'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
         'md5': '3c54c1f6243d279b706bde660ceec633',
         'info_dict': {
-            'id': '5727',
+            'id': 'ts-5727',
             'ext': 'mp4',
-            'description': 'md5:695c01bfd98b7e313c501386327aea59',
             'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+            'description': 'md5:695c01bfd98b7e313c501386327aea59',
+            'thumbnail': 're:^https?:.*\.jpg$',
+        },
+    }, {
+        # exclusive audio
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+        'info_dict': {
+            'id': 'audio-29417',
+            'ext': 'mp3',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
     }, {
-        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
-        'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+        # audio in article
+        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
+        'md5': 'e0916c623e85fc1d2b26b78f299d3958',
         'info_dict': {
-            'id': '18407',
+            'id': 'bnd-303',
             'ext': 'mp3',
-            'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
-            'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'title': 'Viele Baustellen für neuen BND-Chef',
+            'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
+    }, {
+        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
+        'info_dict': {
+            'id': 'afd-parteitag-135',
+            'title': 'Möchtegern-Underdog mit Machtanspruch',
+        },
+        'playlist_count': 2,
     }, {
         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
         'only_matching': True,
@@ -61,88 +197,108 @@ class TagesschauIE(InfoExtractor):
     }, {
         'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
         'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/100sekunden/index.html',
+        'only_matching': True,
+    }, {
+        # playlist article with collapsing sections
+        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
+        'only_matching': True,
     }]
 
-    _FORMATS = {
-        's': {'width': 256, 'height': 144, 'quality': 1},
-        'm': {'width': 512, 'height': 288, 'quality': 2},
-        'l': {'width': 960, 'height': 544, 'quality': 3},
-    }
+    @classmethod
+    def suitable(cls, url):
+        return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
+
+    def _extract_formats(self, download_text, media_kind):
+        links = re.finditer(
+            r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+            download_text)
+        formats = []
+        for l in links:
+            link_url = l.group('url')
+            if not link_url:
+                continue
+            format_id = self._search_regex(
+                r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
+                default=determine_ext(link_url))
+            format = {
+                'format_id': format_id,
+                'url': l.group('url'),
+                'format_name': l.group('name'),
+            }
+            title = l.group('title')
+            if title:
+                if media_kind.lower() == 'video':
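+                    # The button's title attribute encodes codec, resolution,
+                    # bitrates and approximate file size separated by &#10;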
+                    m = re.match(
+                        r'''(?x)
+                            Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+                            (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+                            (?P<vbr>[0-9]+)kbps&\#10;
+                            Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+                            Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': m.group('audio_desc'),
+                            'vcodec': m.group('vcodec'),
+                            'width': int(m.group('width')),
+                            'height': int(m.group('height')),
+                            'abr': int(m.group('abr')),
+                            'vbr': int(m.group('vbr')),
+                            'filesize_approx': parse_filesize(m.group('filesize_approx')),
+                        })
+                else:
+                    m = re.match(
+                        r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': '%s, %s' % (m.group('format'), m.group('note')),
+                            'vcodec': 'none',
+                            'abr': int(m.group('abr')),
+                        })
+            formats.append(format)
+        self._sort_formats(formats)
+        return formats
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
         display_id = video_id.lstrip('-')
+
         webpage = self._download_webpage(url, display_id)
 
-        player_url = self._html_search_meta(
-            'twitter:player', webpage, 'player URL', default=None)
-        if player_url:
-            playerpage = self._download_webpage(
-                player_url, display_id, 'Downloading player page')
-
-            formats = []
-            for media in re.finditer(
-                    r'''(?x)
-                        (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
-                        ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
-                        (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
-                    ''', playerpage):
-                url = media.group('url')
-                type_ = media.group('type')
-                ext = media.group('ext')
-                res = media.group('quality')
-                f = {
-                    'format_id': '%s_%s' % (res, ext) if res else ext,
-                    'url': url,
-                    'ext': ext,
-                    'vcodec': 'none' if type_ == 'audio' else None,
-                }
-                f.update(self._FORMATS.get(res, {}))
-                formats.append(f)
-            thumbnail = self._og_search_thumbnail(playerpage)
-            title = self._og_search_title(webpage).strip()
-            description = self._og_search_description(webpage).strip()
-        else:
+        title = self._html_search_regex(
+            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+
+        DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
+
+        webpage_type = self._og_search_property('type', webpage, default=None)
+        if webpage_type == 'website':  # Article
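+            # Articles may contain several audio/video items, each preceded
+            # by an infotext paragraph carrying the entry title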
+            entries = []
+            for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
+                    r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
+                    webpage), 1):
+                entries.append({
+                    'id': '%s-%d' % (display_id, num),
+                    'title': '%s' % entry_title,
+                    'formats': self._extract_formats(download_text, media_kind),
+                })
+            if len(entries) > 1:
+                return self.playlist_result(entries, display_id, title)
+            formats = entries[0]['formats']
+        else:  # Assume single video
             download_text = self._search_regex(
-                r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
-                webpage, 'download links')
-            links = re.finditer(
-                r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
-                download_text)
-            formats = []
-            for l in links:
-                format_id = self._search_regex(
-                    r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
-                format = {
-                    'format_id': format_id,
-                    'url': l.group('url'),
-                    'format_name': l.group('name'),
-                }
-                m = re.match(
-                    r'''(?x)
-                        Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
-                        (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
-                        (?P<vbr>[0-9]+)kbps&\#10;
-                        Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
-                        Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
-                    l.group('title'))
-                if m:
-                    format.update({
-                        'format_note': m.group('audio_desc'),
-                        'vcodec': m.group('vcodec'),
-                        'width': int(m.group('width')),
-                        'height': int(m.group('height')),
-                        'abr': int(m.group('abr')),
-                        'vbr': int(m.group('vbr')),
-                        'filesize_approx': parse_filesize(m.group('filesize_approx')),
-                    })
-                formats.append(format)
-            thumbnail = self._og_search_thumbnail(webpage)
-            description = self._html_search_regex(
-                r'(?s)<p class="teasertext">(.*?)</p>',
-                webpage, 'description', default=None)
-            title = self._html_search_regex(
-                r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
+                DOWNLOAD_REGEX, webpage, 'download links', group='links')
+            media_kind = self._search_regex(
+                DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
+            formats = self._extract_formats(download_text, media_kind)
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'(?s)<p class="teasertext">(.*?)</p>',
+            webpage, 'description', default=None)
 
         self._sort_formats(formats)
 
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
index e0477382ceabea0769bd0575ceb1f350ce8c0911..d14d93e3ab1ae87902dc275e1208964a86b6b840 100644 (file)
@@ -11,6 +11,7 @@ class TeachingChannelIE(InfoExtractor):
 
     _TEST = {
         'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+        'md5': '3d6361864d7cac20b57c8784da17166f',
         'info_dict': {
             'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
             'ext': 'mp4',
@@ -19,9 +20,9 @@ class TeachingChannelIE(InfoExtractor):
             'duration': 422.255,
         },
         'params': {
-            # m3u8 download
             'skip_download': True,
         },
+        'add_ie': ['Ooyala'],
     }
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index b49ab5f5b98c2d6219d1d17a1c0aea02eb534f61..79a7789200e34e1e457d9cd69cdabb495e3548c3 100644 (file)
@@ -88,7 +88,7 @@ class TeamcocoIE(InfoExtractor):
         preload_codes = self._html_search_regex(
             r'(function.+)setTimeout\(function\(\)\{playlist',
             webpage, 'preload codes')
-        base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+        base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)
         base64_fragments.remove('init')
 
         def _check_sequence(cur_fragments):
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index cf8851438bb74000abb2692c34607f3137505f1d..451cde76d2e757fcdfb30ad96847b16aa4d156ff 100644 (file)
@@ -27,7 +27,7 @@ class TEDIE(InfoExtractor):
         '''
     _TESTS = [{
         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
+        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
         'info_dict': {
             'id': '102',
             'ext': 'mp4',
@@ -37,21 +37,26 @@ class TEDIE(InfoExtractor):
                             'consciousness, but that half the time our brains are '
                             'actively fooling us.'),
             'uploader': 'Dan Dennett',
-            'width': 854,
+            'width': 853,
             'duration': 1308,
         }
     }, {
         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
-        'md5': '226f4fb9c62380d11b7995efa4c87994',
+        'md5': 'b899ac15e345fb39534d913f7606082b',
         'info_dict': {
-            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+            'id': 'tSVI8ta_P4w',
             'ext': 'mp4',
             'title': 'Vishal Sikka: The beauty and power of algorithms',
             'thumbnail': 're:^https?://.+\.jpg',
-            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
-        }
+            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
+            'upload_date': '20140122',
+            'uploader_id': 'TEDInstitute',
+            'uploader': 'TED Institute',
+        },
+        'add_ie': ['Youtube'],
     }, {
         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
         'info_dict': {
             'id': '1972',
             'ext': 'mp4',
@@ -102,9 +107,9 @@ class TEDIE(InfoExtractor):
     }]
 
     _NATIVE_FORMATS = {
-        'low': {'preference': 1, 'width': 320, 'height': 180},
-        'medium': {'preference': 2, 'width': 512, 'height': 288},
-        'high': {'preference': 3, 'width': 854, 'height': 480},
+        'low': {'width': 320, 'height': 180},
+        'medium': {'width': 512, 'height': 288},
+        'high': {'width': 854, 'height': 480},
     }
 
     def _extract_info(self, webpage):
@@ -171,15 +176,21 @@ class TEDIE(InfoExtractor):
                 if finfo:
                     f.update(finfo)
 
+        http_url = None
         for format_id, resources in talk_info['resources'].items():
             if format_id == 'h264':
                 for resource in resources:
+                    h264_url = resource.get('file')
+                    if not h264_url:
+                        continue
                     bitrate = int_or_none(resource.get('bitrate'))
                     formats.append({
-                        'url': resource['file'],
+                        'url': h264_url,
                         'format_id': '%s-%sk' % (format_id, bitrate),
                         'tbr': bitrate,
                     })
+                    if re.search(r'\d+k', h264_url):
+                        http_url = h264_url
             elif format_id == 'rtmp':
                 streamer = talk_info.get('streamer')
                 if not streamer:
@@ -195,16 +206,24 @@ class TEDIE(InfoExtractor):
                         'tbr': int_or_none(resource.get('bitrate')),
                     })
             elif format_id == 'hls':
-                hls_formats = self._extract_m3u8_formats(
-                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
-                for f in hls_formats:
-                    if f.get('format_id') == 'hls-meta':
-                        continue
-                    if not f.get('height'):
-                        f['vcodec'] = 'none'
-                    else:
-                        f['acodec'] = 'none'
-                formats.extend(hls_formats)
+                formats.extend(self._extract_m3u8_formats(
+                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+
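+        # Derive progressive HTTP formats from the HLS variants by
+        # substituting each variant's bitrate into the sampled h264 URL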
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                if not bitrate:
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': re.sub(r'\d+k', bitrate, http_url),
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
 
         audio_download = talk_info.get('audioDownload')
         if audio_download:
@@ -212,7 +231,6 @@ class TEDIE(InfoExtractor):
                 'url': audio_download,
                 'format_id': 'audio',
                 'vcodec': 'none',
-                'preference': -0.5,
             })
 
         self._sort_formats(formats)
@@ -254,7 +272,11 @@ class TEDIE(InfoExtractor):
 
         config_json = self._html_search_regex(
             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
-            webpage, 'config')
+            webpage, 'config', default=None)
+        if not config_json:
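+            # Pages without a JW player config embed an external player;
+            # delegate to the embedded iframe URL instead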
+            embed_url = self._search_regex(
+                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
+            return self.url_result(self._proto_relative_url(embed_url))
         config = json.loads(config_json)['config']
         video_url = config['video']['url']
         thumbnail = config.get('image', {}).get('url')
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 4b4b740b44d325ffb8a8a5c6cba848b0c99ced13..2ecfd0405afa27d78f81fb0c4ba604d022798850 100644 (file)
@@ -1,50 +1,41 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
+from .mitele import MiTeleBaseIE
 
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_urllib_parse_urlencode,
-    compat_urlparse,
-)
-from ..utils import (
-    get_element_by_attribute,
-    parse_duration,
-    strip_jsonp,
-)
 
-
-class TelecincoIE(InfoExtractor):
+class TelecincoIE(MiTeleBaseIE):
     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
     _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
 
     _TESTS = [{
         'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
-        'md5': '5cbef3ad5ef17bf0d21570332d140729',
+        'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
         'info_dict': {
-            'id': 'MDSVID20141015_0058',
+            'id': 'JEA5ijCnF6p5W08A1rNKn7',
             'ext': 'mp4',
-            'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+            'title': 'Bacalao con kokotxas al pil-pil',
+            'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
             'duration': 662,
         },
     }, {
         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
-        'md5': '0a5b9f3cc8b074f50a0578f823a12694',
+        'md5': '284393e5387b3b947b77c613ef04749a',
         'info_dict': {
-            'id': 'MDSVID20150916_0128',
+            'id': 'jn24Od1zGLG4XUZcnUnZB6',
             'ext': 'mp4',
-            'title': '¿Quién es este ex futbolista con el que hablan ...',
+            'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?',
+            'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
             'duration': 79,
         },
     }, {
         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-        'md5': 'ad1bfaaba922dd4a295724b05b68f86a',
+        'md5': '749afab6ea5a136a8806855166ae46a2',
         'info_dict': {
-            'id': 'MDSVID20150513_0220',
+            'id': 'aywerkD2Sv1vGNqq9b85Q2',
             'ext': 'mp4',
             'title': '#DOYLACARA. Con la trata no hay trato',
+            'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
             'duration': 50,
         },
     }, {
@@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        episode = self._match_id(url)
-        webpage = self._download_webpage(url, episode)
-        embed_data_json = self._search_regex(
-            r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
-        ).replace('\'', '"')
-        embed_data = json.loads(embed_data_json)
-
-        domain = embed_data['mediaUrl']
-        if not domain.startswith('http'):
-            # only happens in telecinco.es videos
-            domain = 'http://' + domain
-        info_url = compat_urlparse.urljoin(
-            domain,
-            compat_urllib_parse_unquote(embed_data['flashvars']['host'])
-        )
-        info_el = self._download_xml(info_url, episode).find('./video/info')
-
-        video_link = info_el.find('videoUrl/link').text
-        token_query = compat_urllib_parse_urlencode({'id': video_link})
-        token_info = self._download_json(
-            embed_data['flashvars']['ov_tk'] + '?' + token_query,
-            episode,
-            transform_source=strip_jsonp
-        )
-        formats = self._extract_m3u8_formats(
-            token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
-        self._sort_formats(formats)
-
-        return {
-            'id': embed_data['videoId'],
-            'display_id': episode,
-            'title': info_el.find('title').text,
-            'formats': formats,
-            'description': get_element_by_attribute('class', 'text', webpage),
-            'thumbnail': info_el.find('thumb').text,
-            'duration': parse_duration(info_el.find('duration').text),
-        }
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title')
+        info = self._get_player_info(url, webpage)
+        info.update({
+            'display_id': display_id,
+            'title': title,
+            'description': self._html_search_meta(
+                ['og:description', 'twitter:description'],
+                webpage, 'description', fatal=False),
+        })
+        return info
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
index 6f8333cfc0d40aee4d3637ed1e58867da1277e9f..9092e9b853637d14d54a2c15c524b12e35e91a30 100644 (file)
@@ -2,14 +2,16 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import remove_end
+from ..utils import (
+    determine_ext,
+    remove_end,
+)
 
 
 class TelegraafIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
     _TEST = {
         'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
-        'md5': '83245a9779bcc4a24454bfd53c65b6dc',
         'info_dict': {
             'id': '24353229',
             'ext': 'mp4',
@@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 33,
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
+        video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, playlist_id)
+        webpage = self._download_webpage(url, video_id)
 
+        player_url = self._html_search_regex(
+            r'<iframe[^>]+src="([^"]+)"', webpage, 'player URL')
+        player_page = self._download_webpage(
+            player_url, video_id, note='Downloading player webpage')
         playlist_url = self._search_regex(
-            r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+            r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
+        playlist_data = self._download_json(playlist_url, video_id)
+
+        item = playlist_data['items'][0]
+        formats = []
+        locations = item['locations']
+        for location in locations.get('adaptive', []):
+            manifest_url = location['src']
+            ext = determine_ext(manifest_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, video_id, ext='mp4', m3u8_id='hls'))
+            elif ext == 'mpd':
+                # TODO: Current DASH formats are broken - $Time$ pattern in
+                # <SegmentTemplate> not implemented yet
+                continue
+            else:
+                self.report_warning('Unknown adaptive format %s' % ext)
+        for location in locations.get('progressive', []):
+            formats.append({
+                'url': location['sources'][0]['src'],
+                'width': location.get('width'),
+                'height': location.get('height'),
+                'format_id': 'http-%s' % location['label'],
+            })
+
+        self._sort_formats(formats)
 
-        entries = self._extract_xspf_playlist(playlist_url, playlist_id)
         title = remove_end(self._og_search_title(webpage), ' - VIDEO')
         description = self._og_search_description(webpage)
+        duration = item.get('duration')
+        thumbnail = item.get('poster')
 
-        return self.playlist_result(entries, playlist_id, title, description)
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/telewebion.py b/youtube_dl/extractor/telewebion.py
new file mode 100644 (file)
index 0000000..77916c6
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TelewebionIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.telewebion\.com/#!/episode/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.telewebion.com/#!/episode/1263668/',
+        'info_dict': {
+            'id': '1263668',
+            'ext': 'mp4',
+            'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        secure_token = self._download_webpage(
+            'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id)
+        episode_details = self._download_json(
+            'http://m.s2.telewebion.com/op/op', video_id,
+            query={'action': 'getEpisodeDetails', 'episode_id': video_id})
+
+        m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % (
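+        # Build the HLS playlist URL from the episode's file path and the
+        # security token fetched above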
+            video_id, episode_details['file_path'], secure_token)
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', m3u8_id='hls')
+
+        picture_paths = [
+            episode_details.get('picture_path'),
+            episode_details.get('large_picture_path'),
+        ]
+
+        thumbnails = [{
+            'url': picture_path,
+            'preference': idx,
+        } for idx, picture_path in enumerate(picture_paths) if picture_path is not None]
+
+        return {
+            'id': video_id,
+            'title': episode_details['title'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'view_count': episode_details.get('view_count'),
+        }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 3f54b2744cb16cd6385e5cb06919cbaf9628167a..e595c4a69b3f03361abc05f6bca61adecb61cf36 100644 (file)
@@ -6,7 +6,7 @@ from .common import InfoExtractor
 
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html'
+    _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
     _TESTS = [{
         'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
         'info_dict': {
@@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         wat_id = self._html_search_regex(
-            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1',
+            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
             webpage, 'wat id', group='id')
         return self.url_result('wat:%s' % wat_id, 'Wat')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 7a5a533b7473bc483e64915ed2065c12c9adbdc6..bb3efc4ea17e6f4d0ae2252d7e6a317cd6bf7f03 100644 (file)
@@ -6,6 +6,7 @@ import time
 import hmac
 import binascii
 import hashlib
+import netrc
 
 
 from .once import OnceIE
@@ -14,14 +15,19 @@ from ..compat import (
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    determine_ext,
     ExtractorError,
     float_or_none,
     int_or_none,
     sanitized_Request,
     unsmuggle_url,
+    update_url_query,
     xpath_with_ns,
     mimetype2ext,
     find_xpath_attr,
+    unescapeHTML,
+    urlencode_postdata,
+    unified_timestamp,
 )
 
 default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -48,16 +54,23 @@ class ThePlatformBaseIE(OnceIE):
             if OnceIE.suitable(_format['url']):
                 formats.extend(self._extract_once_formats(_format['url']))
             else:
+                media_url = _format['url']
+                if determine_ext(media_url) == 'm3u8':
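+                    # Propagate the hdnea2 auth cookie into the media URL
+                    # as an hdnea3 query parameter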
+                    hdnea2 = self._get_cookies(media_url).get('hdnea2')
+                    if hdnea2:
+                        _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
+
                 formats.append(_format)
 
         subtitles = self._parse_smil_subtitles(meta, default_ns)
 
         return formats, subtitles
 
-    def get_metadata(self, path, video_id):
+    def _download_theplatform_metadata(self, path, video_id):
         info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
-        info = self._download_json(info_url, video_id)
+        return self._download_json(info_url, video_id)
 
+    def _parse_theplatform_metadata(self, info):
         subtitles = {}
         captions = info.get('captions')
         if isinstance(captions, list):
@@ -78,6 +91,10 @@ class ThePlatformBaseIE(OnceIE):
             'uploader': info.get('billingCode'),
         }
 
+    def _extract_theplatform_metadata(self, path, video_id):
+        info = self._download_theplatform_metadata(path, video_id)
+        return self._parse_theplatform_metadata(info)
+
 
 class ThePlatformIE(ThePlatformBaseIE):
     _VALID_URL = r'''(?x)
@@ -150,6 +167,23 @@ class ThePlatformIE(ThePlatformBaseIE):
         'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
         'only_matching': True,
     }]
+    _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        m = re.search(
+            r'''(?x)
+                    <meta\s+
+                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+            ''', webpage)
+        if m:
+            return [m.group('url')]
+
+        matches = re.findall(
+            r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+        if matches:
+            return list(zip(*matches))[1]
 
     @staticmethod
     def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
@@ -159,15 +193,105 @@ class ThePlatformIE(ThePlatformBaseIE):
         def str_to_hex(str):
             return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
 
-        def hex_to_str(hex):
-            return binascii.a2b_hex(hex)
+        def hex_to_bytes(hex):
+            return binascii.a2b_hex(hex.encode('ascii'))
 
         relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1)
-        clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
+        clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
         checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
         sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
         return '%s&sig=%s' % (url, sig)
 
+    def _extract_mvpd_auth(self, url, video_id, requestor_id, resource):
+        def xml_text(xml_str, tag):
+            return self._search_regex(
+                '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag)
+
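+        # Adobe Pass flow: log in through the TV provider's SAML pages,
+        # cache the resulting authn token, exchange it per resource for
+        # an authz token, then request the short-lived media token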
+        mvpd_headers = {
+            'ap_42': 'anonymous',
+            'ap_11': 'Linux i686',
+            'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
+            'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0',
+        }
+
+        guid = xml_text(resource, 'guid')
+        requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {}
+        authn_token = requestor_info.get('authn_token')
+        if authn_token:
+            token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', ''))
+            if token_expires and token_expires <= time.time():
+                authn_token = None
+        if not authn_token:
+            # TODO add support for other TV Providers
+            mso_id = 'DTV'
+            login_info = netrc.netrc().authenticators(mso_id)
+            if not login_info:
+                return None
+
+            def post_form(form_page, note, data={}):
+                post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
+                return self._download_webpage(
+                    post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={
+                        'Content-Type': 'application/x-www-form-urlencoded',
+                    })
+
+            provider_redirect_page = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
+                'Downloading Provider Redirect Page', query={
+                    'noflash': 'true',
+                    'mso_id': mso_id,
+                    'requestor_id': requestor_id,
+                    'no_iframe': 'false',
+                    'domain_name': 'adobe.com',
+                    'redirect_url': url,
+                })
+            provider_login_page = post_form(
+                provider_redirect_page, 'Downloading Provider Login Page')
+            mvpd_confirm_page = post_form(provider_login_page, 'Logging in', {
+                'username': login_info[0],
+                'password': login_info[2],
+            })
+            post_form(mvpd_confirm_page, 'Confirming Login')
+
+            session = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
+                'Retrieving Session', data=urlencode_postdata({
+                    '_method': 'GET',
+                    'requestor_id': requestor_id,
+                }), headers=mvpd_headers)
+            authn_token = unescapeHTML(xml_text(session, 'authnToken'))
+            requestor_info['authn_token'] = authn_token
+            self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+        authz_token = requestor_info.get(guid)
+        if not authz_token:
+            authorize = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id,
+                'Retrieving Authorization Token', data=urlencode_postdata({
+                    'resource_id': resource,
+                    'requestor_id': requestor_id,
+                    'authentication_token': authn_token,
+                    'mso_id': xml_text(authn_token, 'simpleTokenMsoID'),
+                    'userMeta': '1',
+                }), headers=mvpd_headers)
+            authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
+            requestor_info[guid] = authz_token
+            self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+        mvpd_headers.update({
+            'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
+            'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'),
+        })
+
+        return self._download_webpage(
+            self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize',
+            video_id, 'Retrieving Media Token', data=urlencode_postdata({
+                'authz_token': authz_token,
+                'requestor_id': requestor_id,
+                'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'),
+                'hashed_guid': 'false',
+            }), headers=mvpd_headers)
+
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
 
@@ -241,7 +365,7 @@ class ThePlatformIE(ThePlatformBaseIE):
         formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
         self._sort_formats(formats)
 
-        ret = self.get_metadata(path, video_id)
+        ret = self._extract_theplatform_metadata(path, video_id)
         combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
         ret.update({
             'id': video_id,
@@ -253,9 +377,9 @@ class ThePlatformIE(ThePlatformBaseIE):
 
 
 class ThePlatformFeedIE(ThePlatformBaseIE):
-    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
-    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
-    _TEST = {
+    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
+    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+    _TESTS = [{
         # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
         'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
         'md5': '6e32495b5073ab414471b615c5ded394',
@@ -269,33 +393,40 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
             'timestamp': 1391824260,
             'duration': 467.0,
             'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+            'uploader': 'NBCU-NEWS',
         },
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        provider_id = mobj.group('provider_id')
-        feed_id = mobj.group('feed_id')
+    }]
 
-        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
-        feed = self._download_json(real_url, video_id)
-        entry = feed['entries'][0]
+    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
+        entry = self._download_json(real_url, video_id)['entries'][0]
 
         formats = []
         subtitles = {}
         first_video_id = None
         duration = None
+        asset_types = []
         for item in entry['media$content']:
-            smil_url = item['plfile$url'] + '&mbr=true'
+            smil_url = item['plfile$url']
             cur_video_id = ThePlatformIE._match_id(smil_url)
             if first_video_id is None:
                 first_video_id = cur_video_id
                 duration = float_or_none(item.get('plfile$duration'))
-            cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
-            formats.extend(cur_formats)
-            subtitles = self._merge_subtitles(subtitles, cur_subtitles)
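+            # Download SMIL data once per distinct asset type, applying
+            # any per-type query overrides from asset_types_query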
+            for asset_type in item['plfile$assetTypes']:
+                if asset_type in asset_types:
+                    continue
+                asset_types.append(asset_type)
+                query = {
+                    'mbr': 'true',
+                    'formats': item['plfile$format'],
+                    'assetTypes': asset_type,
+                }
+                if asset_type in asset_types_query:
+                    query.update(asset_types_query[asset_type])
+                cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
+                    smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                formats.extend(cur_formats)
+                subtitles = self._merge_subtitles(subtitles, cur_subtitles)
 
         self._sort_formats(formats)
 
@@ -308,7 +439,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
         timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
         categories = [item['media$name'] for item in entry.get('media$categories', [])]
 
-        ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
         subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
         ret.update({
             'id': video_id,
@@ -319,5 +450,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
             'timestamp': timestamp,
             'categories': categories,
         })
+        if custom_fields:
+            ret.update(custom_fields(entry))
 
         return ret
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        provider_id = mobj.group('provider_id')
+        feed_id = mobj.group('feed_id')
+        filter_query = mobj.group('filter')
+
+        return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
index d8b1fd2813eadc3d17a17a6d46766b3c9c4ea37a..d63aef5dea9a8543f2a919b19321582f20e8df86 100644 (file)
@@ -12,7 +12,7 @@ class TheSixtyOneIE(InfoExtractor):
             s|
             song/comments/list|
             song
-        )/(?P<id>[A-Za-z0-9]+)/?$'''
+        )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$'''
     _SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
     _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
     _THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
@@ -45,6 +45,10 @@ class TheSixtyOneIE(InfoExtractor):
             'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
             'only_matching': True,
         },
+        {
+            'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/',
+            'only_matching': True,
+        },
     ]
 
     _DECODE_MAP = {
diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py
new file mode 100644 (file)
index 0000000..a0bc12c
--- /dev/null
@@ -0,0 +1,138 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    mimetype2ext,
+)
+
+
+class ThreeQSDNIE(InfoExtractor):
+    IE_NAME = '3qsdn'
+    IE_DESC = '3Q SDN'
+    _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+        'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+        'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+        'info_dict': {
+            'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'ext': 'mp4',
+            'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'is_live': False,
+        },
+        'expected_warnings': ['Failed to download MPD manifest'],
+    }, {
+        # live video stream
+        'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+        'info_dict': {
+            'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+            'ext': 'mp4',
+            'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+            'is_live': False,
+        },
+    }, {
+        # live audio stream
+        'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # live audio stream with some 404 URLs
+        'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # geo restricted with 'This content is not available in your country'
+        'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # geo restricted with 'playout.3qsdn.com/forbidden'
+        'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # live video with rtmp link
+        'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        js = self._download_webpage(
+            'http://playout.3qsdn.com/%s' % video_id, video_id,
+            query={'js': 'true'})
+
+        if any(p in js for p in (
+                '>This content is not available in your country',
+                'playout.3qsdn.com/forbidden')):
+            self.raise_geo_restricted()
+
+        stream_content = self._search_regex(
+            r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
+            'stream content', default='demand', group='content')
+
+        live = stream_content == 'live'
+
+        stream_type = self._search_regex(
+            r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
+            'stream type', default='video', group='type')
+
+        formats = []
+        urls = set()
+
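+        # Route each discovered source URL to the matching manifest
+        # parser; the urls set prevents processing duplicates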
+        def extract_formats(item_url, item={}):
+            if not item_url or item_url in urls:
+                return
+            urls.add(item_url)
+            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    item_url, video_id, mpd_id='mpd', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    item_url, video_id, 'mp4',
+                    entry_protocol='m3u8' if live else 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    item_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                if not self._is_valid_url(item_url, video_id):
+                    return
+                formats.append({
+                    'url': item_url,
+                    'format_id': item.get('quality'),
+                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,
+                    'vcodec': 'none' if stream_type == 'audio' else None,
+                })
+
+        for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js):
+            f = self._parse_json(
+                item_js, video_id, transform_source=js_to_json, fatal=False)
+            if not f:
+                continue
+            extract_formats(f.get('src'), f)
+
+        # More relaxed version to collect additional URLs, acting as a
+        # future-proof fallback
+        for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
+            extract_formats(src)
+
+        self._sort_formats(formats)
+
+        title = self._live_title(video_id) if live else video_id
+
+        return {
+            'id': video_id,
+            'title': title,
+            'is_live': live,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
index 4797d1310aaeec2664d822c052f26be5ea5210af..54c2d0aa6c0d234f9f747550ab841c409bfbc079 100644 (file)
@@ -1,74 +1,41 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    unified_strdate,
-)
+from ..utils import int_or_none
 
 
 class TouTvIE(InfoExtractor):
     IE_NAME = 'tou.tv'
-    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)'
 
     _TEST = {
-        'url': 'http://www.tou.tv/30-vies/S04E41',
+        'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
         'info_dict': {
-            'id': '30-vies_S04E41',
+            'id': '122017',
             'ext': 'mp4',
-            'title': '30 vies Saison 4 / Épisode 41',
-            'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
-            'age_limit': 8,
-            'uploader': 'Groupe des Nouveaux Médias',
-            'duration': 1296,
-            'upload_date': '20131118',
-            'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+            'title': 'Saison 2015 Épisode 17',
+            'description': 'La photo de famille 2',
+            'upload_date': '20100717',
         },
         'params': {
-            'skip_download': True,  # Requires rtmpdump
+            # m3u8 download
+            'skip_download': True,
         },
-        'skip': 'Only available in Canada'
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
-
-        mediaId = self._search_regex(
-            r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
-
-        streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
-        streams_doc = self._download_xml(
-            streams_url, video_id, note='Downloading stream list')
-
-        video_url = next(n.text
-                         for n in streams_doc.findall('.//choice/url')
-                         if '//ad.doubleclick' not in n.text)
-        if video_url.endswith('/Unavailable.flv'):
-            raise ExtractorError(
-                'Access to this video is blocked from outside of Canada',
-                expected=True)
-
-        duration_str = self._html_search_meta(
-            'video:duration', webpage, 'duration')
-        duration = int(duration_str) if duration_str else None
-        upload_date_str = self._html_search_meta(
-            'video:release_date', webpage, 'upload date')
-        upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+        path = self._match_id(url)
+        metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path)
+        video_id = metadata['IdMedia']
+        details = metadata['Details']
+        title = details['OriginalTitle']
 
         return {
+            '_type': 'url_transparent',
+            'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id),
             'id': video_id,
-            'title': self._og_search_title(webpage),
-            'url': video_url,
-            'description': self._og_search_description(webpage),
-            'uploader': self._dc_search_uploader(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'age_limit': self._media_rating_search(webpage),
-            'duration': duration,
-            'upload_date': upload_date,
-            'ext': 'mp4',
+            'title': title,
+            'thumbnail': details.get('ImageUrl'),
+            'duration': int_or_none(details.get('LengthInSeconds')),
         }
index f57d609d43eecb13f3bb43ecc042107b5cad50bd..5070082da7ba3b34078a01bd214d02a9e8dcac33 100644 (file)
@@ -1,25 +1,24 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    clean_html,
+    get_element_by_attribute,
+    ExtractorError,
+)
 
 
-class TvpIE(InfoExtractor):
-    IE_NAME = 'tvp.pl'
-    _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
+class TVPIE(InfoExtractor):
+    IE_NAME = 'tvp'
+    IE_DESC = 'Telewizja Polska'
+    _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'
 
     _TESTS = [{
-        'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
-        'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
-        'info_dict': {
-            'id': '4278035',
-            'ext': 'wmv',
-            'title': 'Ogniem i mieczem, odc. 2',
-        },
-    }, {
-        'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+        'url': 'http://vod.tvp.pl/194536/i-seria-odc-13',
         'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
         'info_dict': {
             'id': '194536',
@@ -28,7 +27,7 @@ class TvpIE(InfoExtractor):
         },
     }, {
         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
-        'md5': 'c3b15ed1af288131115ff17a17c19dda',
+        'md5': 'b0005b542e5b4de643a9690326ab1257',
         'info_dict': {
             'id': '17916176',
             'ext': 'mp4',
@@ -36,12 +35,22 @@ class TvpIE(InfoExtractor):
         },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
-        'md5': 'c3b15ed1af288131115ff17a17c19dda',
-        'info_dict': {
-            'id': '17834272',
-            'ext': 'mp4',
-            'title': 'Na sygnale, odc. 39',
-        },
+        'only_matching': True,
+    }, {
+        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+        'only_matching': True,
+    }, {
+        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
+        'only_matching': True,
+    }, {
+        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
+        'only_matching': True,
+    }, {
+        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -50,6 +59,11 @@ class TvpIE(InfoExtractor):
         webpage = self._download_webpage(
             'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
 
+        error_message = get_element_by_attribute('class', 'msg error', webpage)
+        if error_message:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, clean_html(error_message)), expected=True)
+
         title = self._search_regex(
             r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
             webpage, 'title', group='title')
@@ -63,24 +77,50 @@ class TvpIE(InfoExtractor):
             r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
 
         video_url = self._search_regex(
-            r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
-        if not video_url:
+            r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
+            'formats', group='url', default=None)
+        if not video_url or 'material_niedostepny.mp4' in video_url:
             video_url = self._download_json(
                 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
                 video_id)['video_url']
 
-        ext = video_url.rsplit('.', 1)[-1]
-        if ext != 'ism/manifest':
-            if '/' in ext:
-                ext = 'mp4'
+        formats = []
+        video_url_base = self._search_regex(
+            r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
+            video_url, 'video base url', default=None)
+        if video_url_base:
+            # TODO: Current DASH formats are broken - $Time$ pattern in
+            # <SegmentTemplate> not implemented yet
+            # formats.extend(self._extract_mpd_formats(
+            #     video_url_base + '.ism/video.mpd',
+            #     video_id, mpd_id='dash', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                video_url_base + '.ism/video.f4m',
+                video_id, f4m_id='hds', fatal=False))
+            m3u8_formats = self._extract_m3u8_formats(
+                video_url_base + '.ism/video.m3u8', video_id,
+                'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+            self._sort_formats(m3u8_formats)
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                m3u8_formats))
+            formats.extend(m3u8_formats)
+            for i, m3u8_format in enumerate(m3u8_formats, 2):
+                http_url = '%s-%d.mp4' % (video_url_base, i)
+                if self._is_valid_url(http_url, video_id):
+                    f = m3u8_format.copy()
+                    f.update({
+                        'url': http_url,
+                        'format_id': f['format_id'].replace('hls', 'http'),
+                        'protocol': 'http',
+                    })
+                    formats.append(f)
+        else:
             formats = [{
                 'format_id': 'direct',
                 'url': video_url,
-                'ext': ext,
+                'ext': determine_ext(video_url, 'mp4'),
             }]
-        else:
-            m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
-            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
 
         self._sort_formats(formats)
 
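
A minimal sketch of the URL derivation above, assuming a typical base URL (host and path invented):

    video_url_base = 'http://stream.tvp.example/1234/video'

    hds_url = video_url_base + '.ism/video.f4m'
    hls_url = video_url_base + '.ism/video.m3u8'
    # Progressive MP4 counterparts are probed as -2.mp4, -3.mp4, ...
    http_urls = ['%s-%d.mp4' % (video_url_base, i) for i in range(2, 5)]
    print(http_urls[0])  # http://stream.tvp.example/1234/video-2.mp4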
@@ -92,8 +132,8 @@ class TvpIE(InfoExtractor):
         }
 
 
-class TvpSeriesIE(InfoExtractor):
-    IE_NAME = 'tvp.pl:Series'
+class TVPSeriesIE(InfoExtractor):
+    IE_NAME = 'tvp:series'
     _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
 
     _TESTS = [{
@@ -127,7 +167,7 @@ class TvpSeriesIE(InfoExtractor):
         videos_paths = re.findall(
             '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
         entries = [
-            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
+            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
             for v_path in videos_paths]
 
         return {
index f3198fb85adb29b8081b9735899dd574cb504c67..7a9386cde3d9e0e5d78bfd368d47819430c53e85 100644 (file)
@@ -1,25 +1,62 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_ext,
+    mimetype2ext,
+)
 
 
 class TweakersIE(InfoExtractor):
     _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
     _TEST = {
         'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
-        'md5': '3147e4ddad366f97476a93863e4557c8',
+        'md5': 'fe73e417c093a788e0160c4025f88b15',
         'info_dict': {
             'id': '9926',
             'ext': 'mp4',
             'title': 'New Nintendo 3DS XL - Op alle fronten beter',
-            'description': 'md5:f97324cc71e86e11c853f0763820e3ba',
+            'description': 'md5:3789b21fed9c0219e9bcaacd43fab280',
             'thumbnail': 're:^https?://.*\.jpe?g$',
             'duration': 386,
+            'uploader_id': 's7JeEm',
         }
     }
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        entries = self._extract_xspf_playlist(
-            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
-        return self.playlist_result(entries, playlist_id)
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id,
+            video_id)['items'][0]
+
+        title = video_data['title']
+
+        formats = []
+        for location in video_data.get('locations', {}).get('progressive', []):
+            format_id = location.get('label')
+            width = int_or_none(location.get('width'))
+            height = int_or_none(location.get('height'))
+            for source in location.get('sources', []):
+                source_url = source.get('src')
+                if not source_url:
+                    continue
+                ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                formats.append({
+                    'format_id': format_id,
+                    'url': source_url,
+                    'width': width,
+                    'height': height,
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('poster'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': video_data.get('account'),
+            'formats': formats,
+        }
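
The rewrite above assumes a playlist.json shaped roughly like this (field names taken from the accessors in the code; values invented):

    video_data = {
        'title': 'New Nintendo 3DS XL - Op alle fronten beter',
        'locations': {
            'progressive': [{
                'label': '720p',
                'width': 1280,
                'height': 720,
                'sources': [{
                    'src': 'https://example.net/video_720.mp4',
                    'type': 'video/mp4',
                }],
            }],
        },
    }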
index e03e2dbaa42f23a5107a50c67e7c12d9f378600b..4025edf02b4ca5b8c42e3324fb094fe43141ce68 100644 (file)
@@ -47,7 +47,8 @@ class TwentyFourVideoIE(InfoExtractor):
 
         title = self._og_search_title(webpage)
         description = self._html_search_regex(
-            r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False)
+            r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>',
+            webpage, 'description', fatal=False, group='description')
         thumbnail = self._og_search_thumbnail(webpage)
         duration = int_or_none(self._og_search_property(
             'duration', webpage, 'duration', fatal=False))
index ca7d953b8e2733d404ebe9f7cd90e7e28d083abe..b721ecb0a106a710b6d140d7d21309307196a684 100644 (file)
@@ -32,7 +32,22 @@ class TwentyMinutenIE(InfoExtractor):
             'title': '«Wir müssen mutig nach vorne schauen»',
             'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.',
             'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg'
-        }
+        },
+        'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.',
+    }, {
+        # YouTube embed
+        'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184',
+        'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f',
+        'info_dict': {
+            'id': 'ivM7A7SpDOs',
+            'ext': 'mp4',
+            'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016',
+            'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a',
+            'upload_date': '20160424',
+            'uploader': 'RTVCM Castilla-La Mancha',
+            'uploader_id': 'RTVCM',
+        },
+        'add_ie': ['Youtube'],
     }, {
         'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738',
         'only_matching': True,
@@ -48,6 +63,12 @@ class TwentyMinutenIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
+        youtube_url = self._html_search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
+            webpage, 'YouTube embed URL', default=None)
+        if youtube_url is not None:
+            return self.url_result(youtube_url, 'Youtube')
+
         title = self._html_search_regex(
             r'<h1>.*?<span>(.+?)</span></h1>',
             webpage, 'title', default=None)
index 36ee1adff2288570fc39936640bacd3abafe9ed2..67b1277ccadebc7afba028ece1d02859068d004d 100644 (file)
@@ -16,6 +16,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     int_or_none,
+    js_to_json,
     orderedSet,
     parse_duration,
     parse_iso8601,
@@ -28,7 +29,7 @@ class TwitchBaseIE(InfoExtractor):
     _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
 
     _API_BASE = 'https://api.twitch.tv'
-    _USHER_BASE = 'http://usher.twitch.tv'
+    _USHER_BASE = 'https://usher.ttvnw.net'
     _LOGIN_URL = 'http://www.twitch.tv/login'
     _NETRC_MACHINE = 'twitch'
 
@@ -171,6 +172,7 @@ class TwitchVideoIE(TwitchItemBaseIE):
             'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
         },
         'playlist_mincount': 12,
+        'skip': 'HTTP Error 404: Not Found',
     }
 
 
@@ -187,6 +189,7 @@ class TwitchChapterIE(TwitchItemBaseIE):
             'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
         },
         'playlist_mincount': 3,
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
         'only_matching': True,
@@ -258,7 +261,7 @@ class TwitchVodIE(TwitchItemBaseIE):
                     'nauth': access_token['token'],
                     'nauthsig': access_token['sig'],
                 })),
-            item_id, 'mp4')
+            item_id, 'mp4', entry_protocol='m3u8_native')
 
         self._prefer_source(formats)
         info['formats'] = formats
@@ -355,31 +358,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
     }
 
 
-class TwitchBookmarksIE(TwitchPlaylistBaseIE):
-    IE_NAME = 'twitch:bookmarks'
-    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
-    _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
-    _PLAYLIST_TYPE = 'bookmarks'
-
-    _TEST = {
-        'url': 'http://www.twitch.tv/ognos/profile/bookmarks',
-        'info_dict': {
-            'id': 'ognos',
-            'title': 'Ognos',
-        },
-        'playlist_mincount': 3,
-    }
-
-    def _extract_playlist_page(self, response):
-        entries = []
-        for bookmark in response.get('bookmarks', []):
-            video = bookmark.get('video')
-            if not video:
-                continue
-            entries.append(video['url'])
-        return entries
-
-
 class TwitchStreamIE(TwitchBaseIE):
     IE_NAME = 'twitch:stream'
     _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
@@ -477,3 +455,45 @@ class TwitchStreamIE(TwitchBaseIE):
             'formats': formats,
             'is_live': True,
         }
+
+
+class TwitchClipsIE(InfoExtractor):
+    IE_NAME = 'twitch:clips'
+    _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound',
+        'md5': '761769e1eafce0ffebfb4089cb3847cd',
+        'info_dict': {
+            'id': 'AggressiveCobraPoooound',
+            'ext': 'mp4',
+            'title': 'EA Play 2016 Live from the Novo Theatre',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'creator': 'EA',
+            'uploader': 'stereotype_',
+            'uploader_id': 'stereotype_',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        clip = self._parse_json(
+            self._search_regex(
+                r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'),
+            video_id, transform_source=js_to_json)
+
+        video_url = clip['clip_video_url']
+        title = clip['channel_title']
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'),
+            'uploader': clip.get('curator_login'),
+            'uploader_id': clip.get('curator_display_name'),
+        }
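
A minimal sketch of the clipInfo parsing above, on an invented page snippet; js_to_json quotes the bare keys and converts the single-quoted strings so json.loads can take over:

    import json
    import re

    from youtube_dl.utils import js_to_json

    webpage = '''
    clipInfo = {
        clip_video_url: 'https://clips.example/cobra.mp4',
        channel_title: 'EA Play 2016 Live from the Novo Theatre',
        curator_login: 'stereotype_'
    };
    '''
    raw = re.search(r'(?s)clipInfo\s*=\s*({.+?});', webpage).group(1)
    clip = json.loads(js_to_json(raw))
    print(clip['clip_video_url'])  # https://clips.example/cobra.mp4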
index ea673054fdc7135a203cca8db00dc128344b0829..b7384298619608ab879337326b1e6719962932e3 100644 (file)
@@ -5,6 +5,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
     float_or_none,
     xpath_text,
     remove_end,
@@ -52,7 +53,7 @@ class TwitterCardIE(TwitterBaseIE):
                 'id': 'dq4Oj5quskI',
                 'ext': 'mp4',
                 'title': 'Ubuntu 11.10 Overview',
-                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...',
                 'upload_date': '20111013',
                 'uploader': 'OMG! Ubuntu!',
                 'uploader_id': 'omgubuntu',
@@ -116,13 +117,16 @@ class TwitterCardIE(TwitterBaseIE):
         video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
 
         if video_url:
-            f = {
-                'url': video_url,
-            }
+            if determine_ext(video_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls'))
+            else:
+                f = {
+                    'url': video_url,
+                }
 
-            _search_dimensions_in_video_url(f, video_url)
+                _search_dimensions_in_video_url(f, video_url)
 
-            formats.append(f)
+                formats.append(f)
 
         vmap_url = config.get('vmapUrl') or config.get('vmap_url')
         if vmap_url:
@@ -207,6 +211,7 @@ class TwitterIE(InfoExtractor):
             'uploader_id': 'giphz',
         },
         'expected_warnings': ['height', 'width'],
+        'skip': 'Account suspended',
     }, {
         'url': 'https://twitter.com/starwars/status/665052190608723968',
         'md5': '39b7199856dee6cd4432e72c74bc69d4',
@@ -239,10 +244,10 @@ class TwitterIE(InfoExtractor):
         'info_dict': {
             'id': '700207533655363584',
             'ext': 'mp4',
-            'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel',
-            'description': 'jay on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+            'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel',
+            'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
             'thumbnail': 're:^https?://.*\.jpg',
-            'uploader': 'jay',
+            'uploader': 'Donte The Dumbass',
             'uploader_id': 'jaydingeer',
         },
         'params': {
@@ -262,7 +267,6 @@ class TwitterIE(InfoExtractor):
         'add_ie': ['Vine'],
     }, {
         'url': 'https://twitter.com/captainamerica/status/719944021058060289',
-        # md5 constantly changes
         'info_dict': {
             'id': '719944021058060289',
             'ext': 'mp4',
@@ -271,6 +275,9 @@ class TwitterIE(InfoExtractor):
             'uploader_id': 'captainamerica',
             'uploader': 'Captain America',
         },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
     }]
 
     def _real_extract(self, url):
@@ -278,7 +285,11 @@ class TwitterIE(InfoExtractor):
         user_id = mobj.group('user_id')
         twid = mobj.group('id')
 
-        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+        webpage, urlh = self._download_webpage_handle(
+            self._TEMPLATE_URL % (user_id, twid), twid)
+
+        if 'twitter.com/account/suspended' in urlh.geturl():
+            raise ExtractorError('Account suspended by Twitter.', expected=True)
 
         username = remove_end(self._og_search_title(webpage), ' on Twitter')
 
index d1e6f2703e022dac0edc3ef0f16794a6285d2b8f..89b86955913587c3c09474fdffaab8ad338bb26a 100644 (file)
@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor):
         if enroll_url:
             webpage = self._download_webpage(
                 combine_url(base_url, enroll_url),
-                course_id, 'Enrolling in the course')
+                course_id, 'Enrolling in the course',
+                headers={'Referer': base_url})
             if '>You have enrolled in' in webpage:
                 self.to_screen('%s: Successfully enrolled in the course' % course_id)
 
     def _download_lecture(self, course_id, lecture_id):
         return self._download_json(
-            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
-                course_id, lecture_id, compat_urllib_parse_urlencode({
-                    'fields[lecture]': 'title,description,view_html,asset',
-                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
-                })),
-            lecture_id, 'Downloading lecture JSON')
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+            % (course_id, lecture_id),
+            lecture_id, 'Downloading lecture JSON', query={
+                'fields[lecture]': 'title,description,view_html,asset',
+                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+            })
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -142,7 +142,9 @@ class UdemyIE(InfoExtractor):
             self._LOGIN_URL, None, 'Downloading login popup')
 
         def is_logged(webpage):
-            return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
+            return any(re.search(p, webpage) for p in (
+                r'href=["\'](?:https://www\.udemy\.com)?/user/logout/',
+                r'>Logout<'))
 
         # already logged in
         if is_logged(login_popup):
@@ -155,13 +157,13 @@ class UdemyIE(InfoExtractor):
             'password': password,
         })
 
-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form))
-        request.add_header('Referer', self._ORIGIN_URL)
-        request.add_header('Origin', self._ORIGIN_URL)
-
         response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+            self._LOGIN_URL, None, 'Logging in as %s' % username,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._ORIGIN_URL,
+                'Origin': self._ORIGIN_URL,
+            })
 
         if not is_logged(response):
             error = self._html_search_regex(
index ee35b7227372c0ddc128dfc694577578f9fc6009..57dd73aef6f6254f22cdcd814e2d76b20c75b847 100644 (file)
@@ -2,10 +2,13 @@
 from __future__ import unicode_literals
 
 import json
+import re
+
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
+    int_or_none,
     js_to_json,
-    ExtractorError,
 )
 from ..compat import compat_urlparse
 
@@ -16,13 +19,16 @@ class UDNEmbedIE(InfoExtractor):
     _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
     _TESTS = [{
         'url': 'http://video.udn.com/embed/news/300040',
-        'md5': 'de06b4c90b042c128395a88f0384817e',
         'info_dict': {
             'id': '300040',
             'ext': 'mp4',
             'title': '生物老師男變女 全校挺"做自己"',
             'thumbnail': 're:^https?://.*\.jpg$',
-        }
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'https://video.udn.com/embed/news/300040',
         'only_matching': True,
@@ -38,39 +44,53 @@ class UDNEmbedIE(InfoExtractor):
         page = self._download_webpage(url, video_id)
 
         options = json.loads(js_to_json(self._html_search_regex(
-            r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))
+            r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary')))
 
         video_urls = options['video']
 
         if video_urls.get('youtube'):
             return self.url_result(video_urls.get('youtube'), 'Youtube')
 
-        try:
-            del video_urls['youtube']
-        except KeyError:
-            pass
+        formats = []
+        for video_type, api_url in video_urls.items():
+            if not api_url:
+                continue
 
-        formats = [{
-            'url': self._download_webpage(
+            video_url = self._download_webpage(
                 compat_urlparse.urljoin(url, api_url), video_id,
-                'retrieve url for %s video' % video_type),
-            'format_id': video_type,
-            'preference': 0 if video_type == 'mp4' else -1,
-        } for video_type, api_url in video_urls.items() if api_url]
+                note='retrieve url for %s video' % video_type)
 
-        if not formats:
-            raise ExtractorError('No videos found', expected=True)
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, ext='mp4', m3u8_id='hls'))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    video_url, video_id, f4m_id='hds'))
+            else:
+                mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+)\.mp4', video_url)
+                a_format = {
+                    'url': video_url,
+                    # video_type may be 'mp4', which confuses YoutubeDL
+                    'format_id': 'http-' + video_type,
+                }
+                if mobj:
+                    a_format.update({
+                        'height': int_or_none(mobj.group('height')),
+                        'tbr': int_or_none(mobj.group('tbr')),
+                    })
+                formats.append(a_format)
 
         self._sort_formats(formats)
 
-        thumbnail = None
-
-        if options.get('gallery') and len(options['gallery']):
-            thumbnail = options['gallery'][0].get('original')
+        thumbnails = [{
+            'url': img_url,
+            'id': img_type,
+        } for img_type, img_url in options.get('gallery', [{}])[0].items() if img_url]
 
         return {
             'id': video_id,
             'formats': formats,
             'title': options['title'],
-            'thumbnail': thumbnail
+            'thumbnails': thumbnails,
         }
index 66d9f1bf3fc9ff8481fb55aa8045078244b11635..a724cdbef8620821e65cb22324fcfcb33c1a3b4c 100644 (file)
@@ -49,6 +49,7 @@ class UnistraIE(InfoExtractor):
                 'format_id': format_id,
                 'quality': quality(format_id)
             })
+        self._sort_formats(formats)
 
         title = self._html_search_regex(
             r'<title>UTV - (.*?)</', webpage, 'title')
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
new file mode 100644 (file)
index 0000000..ce3bf6b
--- /dev/null
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class URPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde',
+        'md5': '15ca67b63fd8fb320ac2bcd854bad7b6',
+        'info_dict': {
+            'id': '190031',
+            'ext': 'mp4',
+            'title': 'Tripp, Trapp, Träd : Sovkudde',
+            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        urplayer_data = self._parse_json(self._search_regex(
+            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
+        host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+
+        formats = []
+        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
+            file_rtmp = urplayer_data.get('file_rtmp' + quality_attr)
+            if file_rtmp:
+                formats.append({
+                    'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp),
+                    'format_id': quality + '-rtmp',
+                    'ext': 'flv',
+                    'preference': preference,
+                })
+            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+            if file_http:
+                file_http_base_url = 'http://%s/%s' % (host, file_http)
+                formats.extend(self._extract_f4m_formats(
+                    file_http_base_url + 'manifest.f4m', video_id,
+                    preference, '%s-hds' % quality, fatal=False))
+                formats.extend(self._extract_m3u8_formats(
+                    file_http_base_url + 'playlist.m3u8', video_id, 'mp4',
+                    'm3u8_native', preference, '%s-hls' % quality, fatal=False))
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for subtitle in urplayer_data.get('subtitles', []):
+            subtitle_url = subtitle.get('file')
+            kind = subtitle.get('kind')
+            if not subtitle_url or kind and kind != 'captions':
+                continue
+            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
+                'url': subtitle_url,
+            })
+
+        return {
+            'id': video_id,
+            'title': urplayer_data['title'],
+            'description': self._og_search_description(webpage),
+            'thumbnail': urplayer_data.get('image'),
+            'series': urplayer_data.get('series_title'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
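
With the subtitle condition above corrected to skip entries that lack a file or are not captions, the loop behaves like this on invented input:

    subtitles = {}
    for subtitle in [
        {'file': 'http://example.se/sv.vtt', 'kind': 'captions',
         'label': 'Svenska'},
        {'file': None, 'kind': 'captions'},        # no URL: skipped
        {'file': 'http://example.se/x.vtt', 'kind': 'chapters'},  # skipped
    ]:
        subtitle_url = subtitle.get('file')
        kind = subtitle.get('kind')
        if not subtitle_url or kind and kind != 'captions':
            continue
        subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
            'url': subtitle_url,
        })
    print(subtitles)  # {'Svenska': [{'url': 'http://example.se/sv.vtt'}]}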
index cafc082b6bb8a589edaf02ce9a87f266e44c941d..3484a204658e1f09d472c0b31026ec6621121f1f 100644 (file)
@@ -6,10 +6,12 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     unified_strdate,
+    unescapeHTML,
 )
 
 
 class UstudioIE(InfoExtractor):
+    IE_NAME = 'ustudio'
     _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
     _TEST = {
         'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
@@ -27,9 +29,7 @@ class UstudioIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
 
         config = self._download_xml(
             'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
@@ -37,7 +37,7 @@ class UstudioIE(InfoExtractor):
 
         def extract(kind):
             return [{
-                'url': item.attrib['url'],
+                'url': unescapeHTML(item.attrib['url']),
                 'width': int_or_none(item.get('width')),
                 'height': int_or_none(item.get('height')),
             } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
@@ -65,3 +65,61 @@ class UstudioIE(InfoExtractor):
             'uploader': uploader,
             'formats': formats,
         }
+
+
+class UstudioEmbedIE(InfoExtractor):
+    IE_NAME = 'ustudio:embed'
+    _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T',
+        'md5': '47c0be52a09b23a7f40de9469cec58f4',
+        'info_dict': {
+            'id': 'Uw7G1kMCe65T',
+            'ext': 'mp4',
+            'title': '5 Things IT Should Know About Video',
+            'description': 'md5:93d32650884b500115e158c5677d25ad',
+            'uploader_id': 'DeN7VdYRDKhP',
+        }
+    }
+
+    def _real_extract(self, url):
+        uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+        video_data = self._download_json(
+            'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
+            video_id)['videos'][0]
+        title = video_data['name']
+
+        formats = []
+        for ext, qualities in video_data.get('transcodes', {}).items():
+            for quality in qualities:
+                quality_url = quality.get('url')
+                if not quality_url:
+                    continue
+                height = int_or_none(quality.get('height'))
+                formats.append({
+                    'format_id': '%s-%dp' % (ext, height) if height else ext,
+                    'url': quality_url,
+                    'width': int_or_none(quality.get('width')),
+                    'height': height,
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for image in video_data.get('images', []):
+            image_url = image.get('url')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': image_url,
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': uploader_id,
+            'tags': video_data.get('keywords'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
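
The config.json consumed above is assumed to look roughly like this (key names from the accessors; values invented). With a height available the format_id becomes e.g. 'mp4-720p':

    video_data = {
        'name': '5 Things IT Should Know About Video',
        'duration': 100,
        'transcodes': {
            'mp4': [{
                'url': 'https://cdn.example/video_720.mp4',
                'width': 1280,
                'height': 720,
            }],
        },
        'images': [{'url': 'https://cdn.example/poster.jpg'}],
    }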
index 23ce0a0d1929febac87f789374d8411d7b7ddd00..0f5d6873808ed2dce5cde2e6239b6973cf809367 100644 (file)
@@ -37,6 +37,7 @@ class VeohIE(InfoExtractor):
                 'uploader': 'afp-news',
                 'duration': 123,
             },
+            'skip': 'This video has been deleted.',
         },
         {
             'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
index 1a0ff3395598027ebd8de05a609faca987c14e9e..2cd617b91ce4a4a7eba0a639c0956dca3e168576 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import json
+import re
 
 from .common import InfoExtractor
 from ..utils import (
@@ -12,11 +13,11 @@ from ..utils import (
 
 
 class VesselIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+    _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
     _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
     _LOGIN_URL = 'https://www.vessel.com/api/account/login'
     _NETRC_MACHINE = 'vessel'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://www.vessel.com/videos/HDN7G5UMs',
         'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
         'info_dict': {
@@ -28,7 +29,16 @@ class VesselIE(InfoExtractor):
             'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
             'timestamp': int,
         },
-    }
+    }, {
+        'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1',
+            webpage)]
 
     @staticmethod
     def make_json_request(url, data):
@@ -98,16 +108,24 @@ class VesselIE(InfoExtractor):
 
         formats = []
         for f in video_asset.get('sources', []):
-            if f['name'] == 'hls-index':
+            location = f.get('location')
+            if not location:
+                continue
+            name = f.get('name')
+            if name == 'hls-index':
                 formats.extend(self._extract_m3u8_formats(
-                    f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+                    location, video_id, ext='mp4',
+                    entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False))
+            elif name == 'dash-index':
+                formats.extend(self._extract_mpd_formats(
+                    location, video_id, mpd_id='dash', fatal=False))
             else:
                 formats.append({
-                    'format_id': f['name'],
+                    'format_id': name,
                     'tbr': f.get('bitrate'),
                     'height': f.get('height'),
                     'width': f.get('width'),
-                    'url': f['location'],
+                    'url': location,
                 })
         self._sort_formats(formats)
 
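
A quick check of the new _extract_urls pattern against an invented embed snippet:

    import re

    webpage = '<iframe src="//www.vessel.com/embed/G4U7gUJ6a?w=615&h=346"></iframe>'
    print([url for _, url in re.findall(
        r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1',
        webpage)])
    # ['//www.vessel.com/embed/G4U7gUJ6a?w=615&h=346']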
index 147480f6465513066db58ce3cf32e194c4ff8490..388b4debee27d7331ae7dc351338e3829e539071 100644 (file)
@@ -3,7 +3,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_etree_fromstring
+from ..compat import (
+    compat_etree_fromstring,
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -12,13 +16,22 @@ from ..utils import (
 )
 
 
-class VevoIE(InfoExtractor):
+class VevoBaseIE(InfoExtractor):
+    def _extract_json(self, webpage, video_id, item):
+        return self._parse_json(
+            self._search_regex(
+                r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
+                webpage, 'initial store'),
+            video_id)['default'][item]
+
+
+class VevoIE(VevoBaseIE):
     '''
     Accepts urls from vevo.com or in the format 'vevo:{id}'
     (currently used by MTVIE and MySpaceIE)
     '''
     _VALID_URL = r'''(?x)
-        (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
+        (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
            https?://cache\.vevo\.com/m/html/embed\.html\?video=|
            https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
            vevo:)
@@ -30,11 +43,15 @@ class VevoIE(InfoExtractor):
         'info_dict': {
             'id': 'GB1101300280',
             'ext': 'mp4',
-            'title': 'Somebody to Die For',
+            'title': 'Hurts - Somebody to Die For',
+            'timestamp': 1372057200,
             'upload_date': '20130624',
             'uploader': 'Hurts',
-            'timestamp': 1372057200,
+            'track': 'Somebody to Die For',
+            'artist': 'Hurts',
+            'genre': 'Pop',
         },
+        'expected_warnings': ['Unable to download SMIL file'],
     }, {
         'note': 'v3 SMIL format',
         'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
@@ -42,23 +59,31 @@ class VevoIE(InfoExtractor):
         'info_dict': {
             'id': 'USUV71302923',
             'ext': 'mp4',
-            'title': 'I Wish I Could Break Your Heart',
+            'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
+            'timestamp': 1392796919,
             'upload_date': '20140219',
             'uploader': 'Cassadee Pope',
-            'timestamp': 1392796919,
+            'track': 'I Wish I Could Break Your Heart',
+            'artist': 'Cassadee Pope',
+            'genre': 'Country',
         },
+        'expected_warnings': ['Unable to download SMIL file'],
     }, {
         'note': 'Age-limited video',
         'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
         'info_dict': {
             'id': 'USRV81300282',
             'ext': 'mp4',
-            'title': 'Tunnel Vision (Explicit)',
-            'upload_date': '20130703',
+            'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
             'age_limit': 18,
-            'uploader': 'Justin Timberlake',
             'timestamp': 1372888800,
+            'upload_date': '20130703',
+            'uploader': 'Justin Timberlake',
+            'track': 'Tunnel Vision (Explicit)',
+            'artist': 'Justin Timberlake',
+            'genre': 'Pop',
         },
+        'expected_warnings': ['Unable to download SMIL file'],
     }, {
         'note': 'No video_info',
         'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
@@ -66,12 +91,36 @@ class VevoIE(InfoExtractor):
         'info_dict': {
             'id': 'USUV71503000',
             'ext': 'mp4',
-            'title': 'Till I Die',
-            'upload_date': '20151207',
+            'title': 'K Camp - Till I Die',
             'age_limit': 18,
-            'uploader': 'K Camp',
             'timestamp': 1449468000,
+            'upload_date': '20151207',
+            'uploader': 'K Camp',
+            'track': 'Till I Die',
+            'artist': 'K Camp',
+            'genre': 'Rap/Hip-Hop',
+        },
+    }, {
+        'note': 'Only available via webpage',
+        'url': 'http://www.vevo.com/watch/GBUV71600656',
+        'md5': '67e79210613865b66a47c33baa5e37fe',
+        'info_dict': {
+            'id': 'GBUV71600656',
+            'ext': 'mp4',
+            'title': 'ABC - Viva Love',
+            'age_limit': 0,
+            'timestamp': 1461830400,
+            'upload_date': '20160428',
+            'uploader': 'ABC',
+            'track': 'Viva Love',
+            'artist': 'ABC',
+            'genre': 'Pop',
         },
+        'expected_warnings': ['Failed to download video versions info'],
+    }, {
+        # no genres available
+        'url': 'http://www.vevo.com/watch/INS171400764',
+        'only_matching': True,
     }]
     _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com'
     _SOURCE_TYPES = {
@@ -140,42 +189,41 @@ class VevoIE(InfoExtractor):
             errnote='Unable to retrieve oauth token')
 
         if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage:
-            raise ExtractorError(
-                '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True)
+            self.raise_geo_restricted(
+                '%s said: This page is currently unavailable in your region' % self.IE_NAME)
 
         auth_info = self._parse_json(webpage, video_id)
         self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
 
-    def _call_api(self, path, video_id, note, errnote, fatal=True):
-        return self._download_json(self._api_url_template % path, video_id, note, errnote)
+    def _call_api(self, path, *args, **kwargs):
+        return self._download_json(self._api_url_template % path, *args, **kwargs)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
         response = self._download_json(
-            json_url, video_id, 'Downloading video info', 'Unable to download info')
+            json_url, video_id, 'Downloading video info',
+            'Unable to download info', fatal=False) or {}
         video_info = response.get('video') or {}
-        video_versions = video_info.get('videoVersions')
+        artist = None
+        featured_artist = None
         uploader = None
-        timestamp = None
         view_count = None
         formats = []
 
         if not video_info:
-            if response.get('statusCode') != 909:
+            try:
+                self._initialize_api(video_id)
+            except ExtractorError:
                 ytid = response.get('errorInfo', {}).get('ytid')
                 if ytid:
                     self.report_warning(
                         'Video is geoblocked, trying with the YouTube video %s' % ytid)
                     return self.url_result(ytid, 'Youtube', ytid)
 
-                if 'statusMessage' in response:
-                    raise ExtractorError('%s said: %s' % (
-                        self.IE_NAME, response['statusMessage']), expected=True)
-                raise ExtractorError('Unable to extract videos')
+                raise
 
-            self._initialize_api(video_id)
             video_info = self._call_api(
                 'video/%s' % video_id, video_id, 'Downloading api video info',
                 'Failed to download video info')
@@ -183,12 +231,19 @@ class VevoIE(InfoExtractor):
             video_versions = self._call_api(
                 'video/%s/streams' % video_id, video_id,
                 'Downloading video versions info',
-                'Failed to download video versions info')
+                'Failed to download video versions info',
+                fatal=False)
+
+            # Some videos are only available via webpage (e.g.
+            # https://github.com/rg3/youtube-dl/issues/9366)
+            if not video_versions:
+                webpage = self._download_webpage(url, video_id)
+                video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0]
 
             timestamp = parse_iso8601(video_info.get('releaseDate'))
             artists = video_info.get('artists')
             if artists:
-                uploader = artists[0]['name']
+                artist = uploader = artists[0]['name']
             view_count = int_or_none(video_info.get('views', {}).get('total'))
 
             for video_version in video_versions:
@@ -241,7 +296,11 @@ class VevoIE(InfoExtractor):
                 scale=1000)
             artists = video_info.get('mainArtists')
             if artists:
-                uploader = artists[0]['artistName']
+                artist = uploader = artists[0]['artistName']
+
+            featured_artists = video_info.get('featuredArtists')
+            if featured_artists:
+                featured_artist = featured_artists[0]['artistName']
 
             smil_parsed = False
             for video_version in video_info['videoVersions']:
@@ -278,7 +337,15 @@ class VevoIE(InfoExtractor):
                         smil_parsed = True
         self._sort_formats(formats)
 
-        title = video_info['title']
+        track = video_info['title']
+        if featured_artist:
+            artist = '%s ft. %s' % (artist, featured_artist)
+        title = '%s - %s' % (artist, track) if artist else track
+
+        genres = video_info.get('genres')
+        genre = (
+            genres[0] if genres and isinstance(genres, list) and
+            isinstance(genres[0], compat_str) else None)
 
         is_explicit = video_info.get('isExplicit')
         if is_explicit is True:
@@ -300,4 +367,75 @@ class VevoIE(InfoExtractor):
             'duration': duration,
             'view_count': view_count,
             'age_limit': age_limit,
+            'track': track,
+            'artist': uploader,
+            'genre': genre,
         }
+
+
+class VevoPlaylistIE(VevoBaseIE):
+    _VALID_URL = r'https?://www\.vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+        'info_dict': {
+            'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+            'title': 'Best-Of: Birdman',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://www.vevo.com/watch/genre/rock',
+        'info_dict': {
+            'id': 'rock',
+            'title': 'Rock',
+        },
+        'playlist_count': 20,
+    }, {
+        'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0',
+        'md5': '32dcdfddddf9ec6917fc88ca26d36282',
+        'info_dict': {
+            'id': 'USCMV1100073',
+            'ext': 'mp4',
+            'title': 'Birdman - Y.U. MAD',
+            'timestamp': 1323417600,
+            'upload_date': '20111209',
+            'uploader': 'Birdman',
+            'track': 'Y.U. MAD',
+            'artist': 'Birdman',
+            'genre': 'Rap/Hip-Hop',
+        },
+        'expected_warnings': ['Unable to download SMIL file'],
+    }, {
+        'url': 'http://www.vevo.com/watch/genre/rock?index=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        playlist_kind = mobj.group('kind')
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        index = qs.get('index', [None])[0]
+
+        if index:
+            video_id = self._search_regex(
+                r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
+                webpage, 'video id', default=None, group='id')
+            if video_id:
+                return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
+
+        playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind)
+
+        playlist = (list(playlists.values())[0]
+                    if playlist_kind == 'playlist' else playlists[playlist_id])
+
+        entries = [
+            self.url_result('vevo:%s' % src, VevoIE.ie_key())
+            for src in playlist['isrcs']]
+
+        return self.playlist_result(
+            entries, playlist.get('playlistId') or playlist_id,
+            playlist.get('name'), playlist.get('description'))
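
Note that the index handling above works even for index=0: parse_qs yields the string '0', which is truthy. A minimal check, using the URL from the test above:

    from youtube_dl.compat import compat_urlparse

    url = ('http://www.vevo.com/watch/playlist/'
           'dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0')
    qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    index = qs.get('index', [None])[0]
    print(repr(index), bool(index))  # '0' True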
index 95daf4dfdf2155dbbab26f2896cf3c42e0f33e2f..e2b2ce0981cc8767ade2f5ef4c8bc52759b86af3 100644 (file)
@@ -11,12 +11,14 @@ class ViceIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+        'md5': 'e9d77741f9e42ba583e683cd170660f7',
         'info_dict': {
             'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
             'ext': 'flv',
             'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
             'duration': 725.983,
         },
+        'add_ie': ['Ooyala'],
     }, {
         'url': 'http://www.vice.com/video/how-to-hack-a-car',
         'md5': '6fb2989a3fed069fb8eab3401fc2d3c9',
@@ -29,6 +31,7 @@ class ViceIE(InfoExtractor):
             'uploader': 'Motherboard',
             'upload_date': '20140529',
         },
+        'add_ie': ['Youtube'],
     }, {
         'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
         'only_matching': True,
diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py
new file mode 100644 (file)
index 0000000..e7ac5a8
--- /dev/null
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    remove_end,
+    unified_strdate,
+)
+
+
+class VidbitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
+        'md5': '1a34b7f14defe3b8fafca9796892924d',
+        'info_dict': {
+            'id': 'jkL2yDOEq2',
+            'ext': 'mp4',
+            'title': 'Intro to VidBit',
+            'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'upload_date': '20160618',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id)
+
+        video_url, title = [None] * 2
+
+        config = self._parse_json(self._search_regex(
+            r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'),
+            video_id, transform_source=js_to_json)
+        if config:
+            if config.get('file'):
+                video_url = compat_urlparse.urljoin(url, config['file'])
+            title = config.get('title')
+
+        if not video_url:
+            video_url = compat_urlparse.urljoin(url, self._search_regex(
+                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage, 'video URL', group='url'))
+
+        if not title:
+            title = remove_end(
+                self._html_search_regex(
+                    (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
+                    webpage, 'title', default=None) or self._og_search_title(webpage),
+                ' - VidBit')
+
+        description = self._html_search_meta(
+            ('description', 'og:description', 'twitter:description'),
+            webpage, 'description')
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'datePublished', webpage, 'upload date'))
+
+        view_count = int_or_none(self._search_regex(
+            r'<strong>(\d+)</strong> views',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._search_regex(
+            r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
+            webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
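
Since the setup config's file entry may be relative, the code above resolves it against the page URL; a minimal sketch with an invented path:

    from youtube_dl.compat import compat_urlparse

    url = 'http://www.vidbit.co/watch?v=jkL2yDOEq2'
    print(compat_urlparse.urljoin(url, '/video/jkL2yDOEq2.mp4'))
    # http://www.vidbit.co/video/jkL2yDOEq2.mp4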
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
new file mode 100644 (file)
index 0000000..6898042
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VidioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
+        'md5': 'cd2801394afc164e9775db6a140b91fe',
+        'info_dict': {
+            'id': '165683',
+            'display_id': 'dj_ambred-booyah-live-2015',
+            'ext': 'mp4',
+            'title': 'DJ_AMBRED - Booyah (Live 2015)',
+            'description': 'md5:27dc15f819b6a78a626490881adbadf8',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 149,
+            'like_count': int,
+        },
+    }, {
+        'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        m3u8_url, duration, thumbnail = [None] * 3
+
+        clips = self._parse_json(
+            self._html_search_regex(
+                r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1',
+                webpage, 'video data', default='[]', group='data'),
+            display_id, fatal=False)
+        if clips:
+            clip = clips[0]
+            m3u8_url = clip.get('sources', [{}])[0].get('file')
+            duration = clip.get('clip_duration')
+            thumbnail = clip.get('image')
+
+        m3u8_url = m3u8_url or self._search_regex(
+            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1',
+            webpage, 'hls url', group='url')
+        formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+
+        duration = int_or_none(duration or self._search_regex(
+            r'data-video-duration=(["\'])(?P<duration>\d+)\1',
+            webpage, 'duration', group='duration'))
+        thumbnail = thumbnail or self._og_search_thumbnail(webpage)
+
+        like_count = int_or_none(self._search_regex(
+            (r'<span[^>]+data-comment-vote-count=["\'](\d+)',
+             r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'),
+            webpage, 'like count', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'like_count': like_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py
new file mode 100644 (file)
index 0000000..19500eb
--- /dev/null
@@ -0,0 +1,200 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
+
+
+class ViewLiftBaseIE(InfoExtractor):
+    _DOMAINS_REGEX = r'(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv'
+
+
+class ViewLiftEmbedIE(ViewLiftBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX
+    _TESTS = [{
+        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+        'md5': '2924e9215c6eff7a55ed35b72276bd93',
+        'info_dict': {
+            'id': '74849a00-85a9-11e1-9660-123139220831',
+            'ext': 'mp4',
+            'title': '#whilewewatch',
+        }
+    }, {
+        # invalid labels, 360p is better than 480p
+        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+        'md5': '882fca19b9eb27ef865efeeaed376a48',
+        'info_dict': {
+            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+            'ext': 'mp4',
+            'title': 'Life in Limbo',
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>This film is not playable in your area.<' in webpage:
+            raise ExtractorError(
+                'Film %s is not playable in your area.' % video_id, expected=True)
+
+        formats = []
+        has_bitrate = False
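+        # The player's sources array is JavaScript rather than strict JSON,
+        # so normalize it with js_to_json before parsing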
+        for source in self._parse_json(js_to_json(self._search_regex(
+                r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
+            file_ = source.get('file')
+            if not file_:
+                continue
+            type_ = source.get('type')
+            ext = determine_ext(file_)
+            format_id = source.get('label') or ext
+            if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)):
+                formats.extend(self._extract_m3u8_formats(
+                    file_, video_id, 'mp4', m3u8_id='hls'))
+            else:
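+                # Progressive source: infer the bitrate from the file name,
+                # either from an explicit '<n>kbps' marker or from the
+                # '_WxH_<bitrate>.<ext>' suffix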
+                bitrate = int_or_none(self._search_regex(
+                    [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
+                    file_, 'bitrate', default=None))
+                if not has_bitrate and bitrate:
+                    has_bitrate = True
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None))
+                formats.append({
+                    'url': file_,
+                    'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')),
+                    'tbr': bitrate,
+                    'height': height,
+                })
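+        # When no source carried a parsable bitrate, sort by height, tbr and
+        # format_id instead of relying on the default ordering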
+        field_preference = None if has_bitrate else ('height', 'tbr', 'format_id')
+        self._sort_formats(formats, field_preference)
+
+        title = self._search_regex(
+            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
+            webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+
+class ViewLiftIE(ViewLiftBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)/(?:films/title|show|(?:news/)?videos?)/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX
+    _TESTS = [{
+        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+        'md5': '19844f897b35af219773fd63bdec2942',
+        'info_dict': {
+            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+            'display_id': 'lost_for_life',
+            'ext': 'mp4',
+            'title': 'Lost for Life',
+            'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 4489,
+            'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+        'info_dict': {
+            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+            'display_id': 'the_world_cut_project/india',
+            'ext': 'mp4',
+            'title': 'India',
+            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 979,
+            'categories': ['Documentary', 'Sports', 'Politics']
+        }
+    }, {
+        # Film is not playable in your area.
+        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+        'only_matching': True,
+    }, {
+        # Film is not available.
+        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.winnersview.com/videos/the-good-son',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.kesari.tv/news/video/1461919076414',
+        'only_matching': True,
+    }, {
+        # Was once Kaltura embed
+        'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        domain, display_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(url, display_id)
+
+        if ">Sorry, the Film you're looking for is not available.<" in webpage:
+            raise ExtractorError(
+                'Film %s is not available.' % display_id, expected=True)
+
+        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+        snag = self._parse_json(
+            self._search_regex(
+                r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+            display_id)
+
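+        # for/else: the else branch runs only when no Snag page entry matches
+        # film_id, in which case metadata is scraped from the markup instead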
+        for item in snag:
+            if item.get('data', {}).get('film', {}).get('id') == film_id:
+                data = item['data']['film']
+                title = data['title']
+                description = clean_html(data.get('synopsis'))
+                thumbnail = data.get('image')
+                duration = int_or_none(data.get('duration') or data.get('runtime'))
+                categories = [
+                    category['title'] for category in data.get('categories', [])
+                    if category.get('title')]
+                break
+        else:
+            title = self._search_regex(
+                r'itemprop="title">([^<]+)<', webpage, 'title')
+            description = self._html_search_regex(
+                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+                webpage, 'description', default=None) or self._og_search_description(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            duration = parse_duration(self._search_regex(
+                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+                webpage, 'duration', fatal=False))
+            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
+            'id': film_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'categories': categories,
+            'ie_key': 'ViewLiftEmbed',
+        }
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fe94a479339035dfd9a70386e903e5c3770fdfea..a93196a0772fd5588dd2f55c327427d00e814eb4 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse,
     compat_urllib_parse_unquote,
 )
 from ..utils import (
@@ -14,6 +15,7 @@ from ..utils import (
     parse_iso8601,
     sanitized_Request,
     HEADRequest,
+    url_basename,
 )
 
 
@@ -75,11 +77,11 @@ class ViewsterIE(InfoExtractor):
 
     _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
 
-    def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+    def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}):
         request = sanitized_Request(url)
         request.add_header('Accept', self._ACCEPT_HEADER)
         request.add_header('Auth-token', self._AUTH_TOKEN)
-        return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
+        return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -114,43 +116,85 @@ class ViewsterIE(InfoExtractor):
             return self.playlist_result(entries, video_id, title, description)
 
         formats = []
-        for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
-            media = self._download_json(
-                'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
-                % (entry_id, compat_urllib_parse.quote(media_type)),
-                video_id, 'Downloading %s JSON' % media_type, fatal=False)
-            if not media:
-                continue
-            video_url = media.get('Uri')
-            if not video_url:
-                continue
-            ext = determine_ext(video_url)
-            if ext == 'f4m':
-                video_url += '&' if '?' in video_url else '?'
-                video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
-                formats.extend(self._extract_f4m_formats(
-                    video_url, video_id, f4m_id='hds'))
-            elif ext == 'm3u8':
-                m3u8_formats = self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', m3u8_id='hls',
-                    fatal=False)  # m3u8 sometimes fail
-                if m3u8_formats:
-                    formats.extend(m3u8_formats)
-            else:
-                format_id = media.get('Bitrate')
-                f = {
-                    'url': video_url,
-                    'format_id': 'mp4-%s' % format_id,
-                    'height': int_or_none(media.get('Height')),
-                    'width': int_or_none(media.get('Width')),
-                    'preference': 1,
-                }
-                if format_id and not f['height']:
-                    f['height'] = int_or_none(self._search_regex(
-                        r'^(\d+)[pP]$', format_id, 'height', default=None))
-                formats.append(f)
-
-        if not formats and not info.get('LanguageSets') and not info.get('VODSettings'):
+        for language_set in info.get('LanguageSets', []):
+            manifest_url = None
+            m3u8_formats = []
+            audio = language_set.get('Audio') or ''
+            subtitle = language_set.get('Subtitle') or ''
+            base_format_id = audio
+            if subtitle:
+                base_format_id += '-%s' % subtitle
+
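+            # Compose format ids from this language set's audio and subtitle
+            # languages, e.g. 'en-hls' or 'en-fr-hls'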
+            def concat(suffix, sep='-'):
+                return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix
+
+            for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
+                media = self._download_json(
+                    'https://public-api.viewster.com/movies/%s/video' % entry_id,
+                    video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={
+                        'mediaType': media_type,
+                        'language': audio,
+                        'subtitle': subtitle,
+                    })
+                if not media:
+                    continue
+                video_url = media.get('Uri')
+                if not video_url:
+                    continue
+                ext = determine_ext(video_url)
+                if ext == 'f4m':
+                    manifest_url = video_url
+                    video_url += '&' if '?' in video_url else '?'
+                    video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+                    formats.extend(self._extract_f4m_formats(
+                        video_url, video_id, f4m_id=concat('hds')))
+                elif ext == 'm3u8':
+                    manifest_url = video_url
+                    m3u8_formats = self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', m3u8_id=concat('hls'),
+                        fatal=False)  # m3u8 sometimes fail
+                    if m3u8_formats:
+                        formats.extend(m3u8_formats)
+                else:
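+                    # video/mp4: the manifest's .csmil basename embeds a
+                    # comma-separated bitrate list (e.g. ',400k,800k,'), which
+                    # is templated into per-quality progressive URLs below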
+                    qualities_basename = self._search_regex(
+                        r'/([^/]+)\.csmil/',
+                        manifest_url, 'qualities basename', default=None)
+                    if not qualities_basename:
+                        continue
+                    QUALITIES_RE = r'((,\d+k)+,?)'
+                    qualities = self._search_regex(
+                        QUALITIES_RE, qualities_basename,
+                        'qualities', default=None)
+                    if not qualities:
+                        continue
+                    qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(',')))
+                    qualities.sort()
+                    http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename)
+                    http_url_basename = url_basename(video_url)
+                    if m3u8_formats:
+                        self._sort_formats(m3u8_formats)
+                        m3u8_formats = list(filter(
+                            lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                            m3u8_formats))
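+                    # If bitrates and HLS video variants pair up one-to-one,
+                    # copy each variant's metadata to its HTTP counterpart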
+                    if len(qualities) == len(m3u8_formats):
+                        for q, m3u8_format in zip(qualities, m3u8_formats):
+                            f = m3u8_format.copy()
+                            f.update({
+                                'url': video_url.replace(http_url_basename, http_template % q),
+                                'format_id': f['format_id'].replace('hls', 'http'),
+                                'protocol': 'http',
+                            })
+                            formats.append(f)
+                    else:
+                        for q in qualities:
+                            formats.append({
+                                'url': video_url.replace(http_url_basename, http_template % q),
+                                'ext': 'mp4',
+                                'format_id': 'http-%d' % q,
+                                'tbr': q,
+                            })
+
+        if not formats and not info.get('VODSettings'):
             self.raise_geo_restricted()
 
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index e04b814c8cf27755bfe0a86af3d5bf43262bd0da..efa15e0b633f56dc24c15eeb4e63889cbb084ab3 100644 (file)
@@ -101,10 +101,13 @@ class VikiBaseIE(InfoExtractor):
             self.report_warning('Unable to get session token, login has probably failed')
 
     @staticmethod
-    def dict_selection(dict_obj, preferred_key):
+    def dict_selection(dict_obj, preferred_key, allow_fallback=True):
         if preferred_key in dict_obj:
             return dict_obj.get(preferred_key)
 
+        if not allow_fallback:
+            return
+
         filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
         return filtered_dict[0] if filtered_dict else None
 
@@ -127,7 +130,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
-        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
+        'md5': 'feea2b1d7b3957f70886e6dfd8b8be84',
         'info_dict': {
             'id': '1067139v',
             'ext': 'mp4',
@@ -156,17 +159,18 @@ class VikiIE(VikiBaseIE):
         'params': {
             # m3u8 download
             'skip_download': True,
-        }
+        },
+        'skip': 'Blocked in the US',
     }, {
         # episode
         'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '190f3ef426005ba3a080a63325955bc3',
+        'md5': '1f54697dabc8f13f31bf06bb2e4de6db',
         'info_dict': {
             'id': '44699v',
             'ext': 'mp4',
             'title': 'Boys Over Flowers - Episode 1',
-            'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
-            'duration': 4155,
+            'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
+            'duration': 4204,
             'timestamp': 1270496524,
             'upload_date': '20100405',
             'uploader': 'group8',
@@ -196,7 +200,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # non-English description
         'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'md5': '013dc282714e22acf9447cad14ff1208',
         'info_dict': {
             'id': '158036v',
             'ext': 'mp4',
@@ -217,7 +221,7 @@ class VikiIE(VikiBaseIE):
 
         self._check_errors(video)
 
-        title = self.dict_selection(video.get('titles', {}), 'en')
+        title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
         if not title:
             title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
             container_titles = video.get('container', {}).get('titles', {})
@@ -302,7 +306,7 @@ class VikiChannelIE(VikiBaseIE):
             'title': 'Boys Over Flowers',
             'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
         },
-        'playlist_count': 70,
+        'playlist_mincount': 71,
     }, {
         'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
         'info_dict': {
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 59f9cb1ae49ab0adb7fcc62ec81b12c30e652b28..d9c9852d463d2ee533e3447f536a1a665e06ddd8 100644 (file)
@@ -8,6 +8,7 @@ import itertools
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
+    compat_str,
     compat_urlparse,
 )
 from ..utils import (
@@ -15,6 +16,7 @@ from ..utils import (
     ExtractorError,
     InAdvancePagedList,
     int_or_none,
+    NO_DEFAULT,
     RegexNotFoundError,
     sanitized_Request,
     smuggle_url,
@@ -24,6 +26,7 @@ from ..utils import (
     urlencode_postdata,
     unescapeHTML,
     parse_filesize,
+    try_get,
 )
 
 
@@ -54,6 +57,26 @@ class VimeoBaseInfoExtractor(InfoExtractor):
         self._set_vimeo_cookie('vuid', vuid)
         self._download_webpage(login_request, None, False, 'Wrong login info')
 
+    def _verify_video_password(self, url, video_id, webpage):
+        password = self._downloader.params.get('videopassword')
+        if password is None:
+            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+        token, vuid = self._extract_xsrft_and_vuid(webpage)
+        data = urlencode_postdata({
+            'password': password,
+            'token': token,
+        })
+        if url.startswith('http://'):
+            # vimeo only supports https now, but the user can give an http url
+            url = url.replace('http://', 'https://')
+        password_request = sanitized_Request(url + '/password', data)
+        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        password_request.add_header('Referer', url)
+        self._set_vimeo_cookie('vuid', vuid)
+        return self._download_webpage(
+            password_request, video_id,
+            'Verifying the password', 'Wrong password')
+
     def _extract_xsrft_and_vuid(self, webpage):
         xsrft = self._search_regex(
             r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
@@ -66,6 +89,69 @@ class VimeoBaseInfoExtractor(InfoExtractor):
     def _set_vimeo_cookie(self, name, value):
         self._set_cookie('vimeo.com', name, value)
 
+    def _vimeo_sort_formats(self, formats):
+        # Bitrates are completely broken. A single m3u8 may contain entries in
+        # kbps and bps at the same time without actual units specified, which
+        # leads to wrong sorting.
+        self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id'))
+
+    def _parse_config(self, config, video_id):
+        # Extract title
+        video_title = config['video']['title']
+
+        # Extract uploader, uploader_url and uploader_id
+        video_uploader = config['video'].get('owner', {}).get('name')
+        video_uploader_url = config['video'].get('owner', {}).get('url')
+        video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
+
+        # Extract video thumbnail
+        video_thumbnail = config['video'].get('thumbnail')
+        if video_thumbnail is None:
+            video_thumbs = config['video'].get('thumbs')
+            if video_thumbs and isinstance(video_thumbs, dict):
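+                # Keep the thumbnail with the largest numeric width key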
+                _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
+
+        # Extract video duration
+        video_duration = int_or_none(config['video'].get('duration'))
+
+        formats = []
+        config_files = config['video'].get('files') or config['request'].get('files', {})
+        for f in config_files.get('progressive', []):
+            video_url = f.get('url')
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'format_id': 'http-%s' % f.get('quality'),
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'fps': int_or_none(f.get('fps')),
+                'tbr': int_or_none(f.get('bitrate')),
+            })
+        m3u8_url = config_files.get('hls', {}).get('url')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+
+        subtitles = {}
+        text_tracks = config['request'].get('text_tracks')
+        if text_tracks:
+            for tt in text_tracks:
+                subtitles[tt['lang']] = [{
+                    'ext': 'vtt',
+                    'url': 'https://vimeo.com' + tt['url'],
+                }]
+
+        return {
+            'title': video_title,
+            'uploader': video_uploader,
+            'uploader_id': video_uploader_id,
+            'uploader_url': video_uploader_url,
+            'thumbnail': video_thumbnail,
+            'duration': video_duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
 
 class VimeoIE(VimeoBaseInfoExtractor):
     """Information extractor for vimeo.com."""
@@ -81,7 +167,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                             \.
                         )?
                         vimeo(?P<pro>pro)?\.com/
-                        (?!channels/[^/?#]+/?(?:$|[?#])|[^/]+/review/|(?:album|ondemand)/)
+                        (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
                         (?:.*?/)?
                         (?:
                             (?:
@@ -153,7 +239,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'uploader_id': 'user18948128',
                 'uploader': 'Jaime Marquínez Ferrándiz',
                 'duration': 10,
-                'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026',
+                'description': 'This is "youtube-dl password protected test video" by  on Vimeo, the home for high quality videos and the people who love them.',
             },
             'params': {
                 'videopassword': 'youtube-dl',
@@ -162,8 +248,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
         {
             'url': 'http://vimeo.com/channels/keypeele/75629013',
             'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
-            'note': 'Video is freely available via original URL '
-                    'and protected with password when accessed via http://vimeo.com/75629013',
             'info_dict': {
                 'id': '75629013',
                 'ext': 'mp4',
@@ -207,7 +291,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         {
             # contains original format
             'url': 'https://vimeo.com/33951933',
-            'md5': '53c688fa95a55bf4b7293d37a89c5c53',
+            'md5': '2d9f5475e0537f013d0073e812ab89e6',
             'info_dict': {
                 'id': '33951933',
                 'ext': 'mp4',
@@ -219,6 +303,29 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'description': 'md5:ae23671e82d05415868f7ad1aec21147',
             },
         },
+        {
+            # only available via https://vimeo.com/channels/tributes/6213729 and
+            # not via https://vimeo.com/6213729
+            'url': 'https://vimeo.com/channels/tributes/6213729',
+            'info_dict': {
+                'id': '6213729',
+                'ext': 'mp4',
+                'title': 'Vimeo Tribute: The Shining',
+                'uploader': 'Casey Donahue',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue',
+                'uploader_id': 'caseydonahue',
+                'upload_date': '20090821',
+                'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['Unable to download JSON metadata'],
+        },
+        {
+            'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
+            'only_matching': True,
+        },
         {
             'url': 'https://vimeo.com/109815029',
             'note': 'Video not completely processed, "failed" seed status',
@@ -228,6 +335,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
             'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
             'only_matching': True,
         },
+        {
+            'url': 'https://vimeo.com/album/2632481/video/79010983',
+            'only_matching': True,
+        },
         {
             # source file returns 403: Forbidden
             'url': 'https://vimeo.com/7809605',
@@ -254,26 +365,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
         if mobj:
             return mobj.group(1)
 
-    def _verify_video_password(self, url, video_id, webpage):
-        password = self._downloader.params.get('videopassword')
-        if password is None:
-            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
-        token, vuid = self._extract_xsrft_and_vuid(webpage)
-        data = urlencode_postdata({
-            'password': password,
-            'token': token,
-        })
-        if url.startswith('http://'):
-            # vimeo only supports https now, but the user can give an http url
-            url = url.replace('http://', 'https://')
-        password_request = sanitized_Request(url + '/password', data)
-        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        password_request.add_header('Referer', url)
-        self._set_vimeo_cookie('vuid', vuid)
-        return self._download_webpage(
-            password_request, video_id,
-            'Verifying the password', 'Wrong password')
-
     def _verify_player_video_password(self, url, video_id):
         password = self._downloader.params.get('videopassword')
         if password is None:
@@ -304,7 +395,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         orig_url = url
         if mobj.group('pro') or mobj.group('player'):
             url = 'https://player.vimeo.com/video/' + video_id
-        else:
+        elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
             url = 'https://vimeo.com/' + video_id
 
         # Retrieve video webpage to extract further information
@@ -382,28 +473,24 @@ class VimeoIE(VimeoBaseInfoExtractor):
             if config.get('view') == 4:
                 config = self._verify_player_video_password(url, video_id)
 
-        if '>You rented this title.<' in webpage:
+        def is_rented():
+            if '>You rented this title.<' in webpage:
+                return True
+            if config.get('user', {}).get('purchased'):
+                return True
+            label = try_get(
+                config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str)
+            if label and label.startswith('You rented this'):
+                return True
+            return False
+
+        if is_rented():
             feature_id = config.get('video', {}).get('vod', {}).get('feature_id')
             if feature_id and not data.get('force_feature_id', False):
                 return self.url_result(smuggle_url(
                     'https://player.vimeo.com/player/%s' % feature_id,
                     {'force_feature_id': True}), 'Vimeo')
 
-        # Extract title
-        video_title = config['video']['title']
-
-        # Extract uploader, uploader_url and uploader_id
-        video_uploader = config['video'].get('owner', {}).get('name')
-        video_uploader_url = config['video'].get('owner', {}).get('url')
-        video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
-
-        # Extract video thumbnail
-        video_thumbnail = config['video'].get('thumbnail')
-        if video_thumbnail is None:
-            video_thumbs = config['video'].get('thumbs')
-            if video_thumbs and isinstance(video_thumbs, dict):
-                _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
-
         # Extract video description
 
         video_description = self._html_search_regex(
@@ -423,9 +510,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
         if not video_description and not mobj.group('player'):
             self._downloader.report_warning('Cannot find video description')
 
-        # Extract video duration
-        video_duration = int_or_none(config['video'].get('duration'))
-
         # Extract upload date
         video_upload_date = None
         mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage)
@@ -463,53 +547,22 @@ class VimeoIE(VimeoBaseInfoExtractor):
                             'format_id': source_name,
                             'preference': 1,
                         })
-        config_files = config['video'].get('files') or config['request'].get('files', {})
-        for f in config_files.get('progressive', []):
-            video_url = f.get('url')
-            if not video_url:
-                continue
-            formats.append({
-                'url': video_url,
-                'format_id': 'http-%s' % f.get('quality'),
-                'width': int_or_none(f.get('width')),
-                'height': int_or_none(f.get('height')),
-                'fps': int_or_none(f.get('fps')),
-                'tbr': int_or_none(f.get('bitrate')),
-            })
-        m3u8_url = config_files.get('hls', {}).get('url')
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
-        # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
-        # at the same time without actual units specified. This lead to wrong sorting.
-        self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id'))
-
-        subtitles = {}
-        text_tracks = config['request'].get('text_tracks')
-        if text_tracks:
-            for tt in text_tracks:
-                subtitles[tt['lang']] = [{
-                    'ext': 'vtt',
-                    'url': 'https://vimeo.com' + tt['url'],
-                }]
 
-        return {
+        info_dict = self._parse_config(config, video_id)
+        formats.extend(info_dict['formats'])
+        self._vimeo_sort_formats(formats)
+        info_dict.update({
             'id': video_id,
-            'uploader': video_uploader,
-            'uploader_url': video_uploader_url,
-            'uploader_id': video_uploader_id,
+            'formats': formats,
             'upload_date': video_upload_date,
-            'title': video_title,
-            'thumbnail': video_thumbnail,
             'description': video_description,
-            'duration': video_duration,
-            'formats': formats,
             'webpage_url': url,
             'view_count': view_count,
             'like_count': like_count,
             'comment_count': comment_count,
-            'subtitles': subtitles,
-        }
+        })
+
+        return info_dict
 
 
 class VimeoOndemandIE(VimeoBaseInfoExtractor):
@@ -603,8 +656,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
                 webpage = self._login_list_password(page_url, list_id, webpage)
                 yield self._extract_list_title(webpage)
 
-            for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
-                yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
+            # Try extracting href first since not all videos are available via
+            # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
+            clips = re.findall(
+                r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage)
+            if clips:
+                for video_id, video_url in clips:
+                    yield self.url_result(
+                        compat_urlparse.urljoin(base_url, video_url),
+                        VimeoIE.ie_key(), video_id=video_id)
+            # More relaxed fallback
+            else:
+                for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
+                    yield self.url_result(
+                        'https://vimeo.com/%s' % video_id,
+                        VimeoIE.ie_key(), video_id=video_id)
 
             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                 break
@@ -641,7 +707,7 @@ class VimeoUserIE(VimeoChannelIE):
 
 class VimeoAlbumIE(VimeoChannelIE):
     IE_NAME = 'vimeo:album'
-    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
+    _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
     _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
     _TESTS = [{
         'url': 'https://vimeo.com/album/2632481',
@@ -661,6 +727,13 @@ class VimeoAlbumIE(VimeoChannelIE):
         'params': {
             'videopassword': 'youtube-dl',
         }
+    }, {
+        'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
+        'only_matching': True,
+    }, {
+        # TODO: respect page number
+        'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
+        'only_matching': True,
     }]
 
     def _page_url(self, base_url, pagenum):
@@ -692,7 +765,7 @@ class VimeoGroupsIE(VimeoAlbumIE):
         return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)
 
 
-class VimeoReviewIE(InfoExtractor):
+class VimeoReviewIE(VimeoBaseInfoExtractor):
     IE_NAME = 'vimeo:review'
     IE_DESC = 'Review pages on vimeo'
     _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
@@ -704,6 +777,7 @@ class VimeoReviewIE(InfoExtractor):
             'ext': 'mp4',
             'title': "DICK HARDWICK 'Comedian'",
             'uploader': 'Richard Hardwick',
+            'uploader_id': 'user21297594',
         }
     }, {
         'note': 'video player needs Referer',
@@ -716,14 +790,45 @@ class VimeoReviewIE(InfoExtractor):
             'uploader': 'DevWeek Events',
             'duration': 2773,
             'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader_id': 'user22258446',
         }
+    }, {
+        'note': 'Password protected',
+        'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
+        'info_dict': {
+            'id': '138823582',
+            'ext': 'mp4',
+            'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
+            'uploader': 'TMB',
+            'uploader_id': 'user37284429',
+        },
+        'params': {
+            'videopassword': 'holygrail',
+        },
     }]
 
+    def _real_initialize(self):
+        self._login()
+
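+    # data-config-url is missing on password protected reviews; verify the
+    # password once and retry, with NO_DEFAULT making the second lookup fatal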
+    def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
+        webpage = self._download_webpage(webpage_url, video_id)
+        config_url = self._html_search_regex(
+            r'data-config-url="([^"]+)"', webpage, 'config URL',
+            default=NO_DEFAULT if video_password_verified else None)
+        if config_url is None:
+            self._verify_video_password(webpage_url, video_id, webpage)
+            config_url = self._get_config_url(
+                webpage_url, video_id, video_password_verified=True)
+        return config_url
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        player_url = 'https://player.vimeo.com/player/' + video_id
-        return self.url_result(player_url, 'Vimeo', video_id)
+        video_id = self._match_id(url)
+        config_url = self._get_config_url(url, video_id)
+        config = self._download_json(config_url, video_id)
+        info_dict = self._parse_config(config, video_id)
+        self._vimeo_sort_formats(info_dict['formats'])
+        info_dict['id'] = video_id
+        return info_dict
 
 
 class VimeoWatchLaterIE(VimeoChannelIE):
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index a6a6cc47955f6aae482d8022bf52d4d233fb955f..0183f052a599f411a48053360ee41670e758f7af 100644 (file)
@@ -24,6 +24,7 @@ class VineIE(InfoExtractor):
             'upload_date': '20130519',
             'uploader': 'Jack Dorsey',
             'uploader_id': '76',
+            'view_count': int,
             'like_count': int,
             'comment_count': int,
             'repost_count': int,
@@ -39,6 +40,7 @@ class VineIE(InfoExtractor):
             'upload_date': '20140815',
             'uploader': 'Mars Ruiz',
             'uploader_id': '1102363502380728320',
+            'view_count': int,
             'like_count': int,
             'comment_count': int,
             'repost_count': int,
@@ -54,6 +56,7 @@ class VineIE(InfoExtractor):
             'upload_date': '20130430',
             'uploader': 'Z3k3',
             'uploader_id': '936470460173008896',
+            'view_count': int,
             'like_count': int,
             'comment_count': int,
             'repost_count': int,
@@ -71,6 +74,7 @@ class VineIE(InfoExtractor):
             'upload_date': '20150705',
             'uploader': 'Pimry_zaa',
             'uploader_id': '1135760698325307392',
+            'view_count': int,
             'like_count': int,
             'comment_count': int,
             'repost_count': int,
@@ -86,10 +90,12 @@ class VineIE(InfoExtractor):
 
         data = self._parse_json(
             self._search_regex(
-                r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
+                r'window\.POST_DATA\s*=\s*({.+?});\s*</script>',
                 webpage, 'vine data'),
             video_id)
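+        # window.POST_DATA is an object keyed by the video id; unwrap its
+        # single entry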
 
+        data = data[list(data.keys())[0]]
+
         formats = [{
             'format_id': '%(format)s-%(rate)s' % f,
             'vcodec': f.get('format'),
@@ -109,6 +115,7 @@ class VineIE(InfoExtractor):
             'upload_date': unified_strdate(data.get('created')),
             'uploader': username,
             'uploader_id': data.get('userIdStr'),
+            'view_count': int_or_none(data.get('loops', {}).get('count')),
             'like_count': int_or_none(data.get('likes', {}).get('count')),
             'comment_count': int_or_none(data.get('comments', {}).get('count')),
             'repost_count': int_or_none(data.get('reposts', {}).get('count')),
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 67220f1b7a991e48494adf24c791317e29eda8cd..758d9c86b9bfb4c4db51cc8cb8e201fe017c3f7a 100644 (file)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 import re
 import json
+import sys
 
 from .common import InfoExtractor
 from ..compat import compat_str
@@ -10,7 +11,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     orderedSet,
-    sanitized_Request,
     str_to_int,
     unescapeHTML,
     unified_strdate,
@@ -26,12 +26,16 @@ class VKIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
                             (?:
-                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
-                                (?:www\.)?biqle\.ru/watch/
+                                (?:(?:m|new)\.)?vk\.com/video_|
+                                (?:www\.)?daxab\.com/
                             )
-                            (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+                            ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
+                            (?:
+                                (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:www\.)?daxab\.com/embed/
+                            )
+                            (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
                         )
                     '''
     _NETRC_MACHINE = 'vk'
@@ -75,7 +79,8 @@ class VKIE(InfoExtractor):
                 'duration': 101,
                 'upload_date': '20120730',
                 'view_count': int,
-            }
+            },
+            'skip': 'This video has been removed from public access.',
         },
         {
             # VIDEO NOW REMOVED
@@ -142,7 +147,7 @@ class VKIE(InfoExtractor):
                 'id': 'V3K4mi0SYkc',
                 'ext': 'webm',
                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
-                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+                'description': 'md5:d9903938abdc74c738af77f527ca0596',
                 'duration': 178,
                 'upload_date': '20130116',
                 'uploader': "Children's Joy Foundation",
@@ -174,13 +179,12 @@ class VKIE(InfoExtractor):
             'only_matching': True,
         },
         {
-            # vk wrapper
-            'url': 'http://www.biqle.ru/watch/847655_160197695',
+            # pladform embed
+            'url': 'https://vk.com/video-76116461_171554880',
             'only_matching': True,
         },
         {
-            # pladform embed
-            'url': 'https://vk.com/video-76116461_171554880',
+            'url': 'http://new.vk.com/video205387401_165548505',
             'only_matching': True,
         }
     ]
@@ -190,7 +194,7 @@ class VKIE(InfoExtractor):
         if username is None:
             return
 
-        login_page = self._download_webpage(
+        login_page, url_handle = self._download_webpage_handle(
             'https://vk.com', None, 'Downloading login page')
 
         login_form = self._hidden_inputs(login_page)
@@ -200,11 +204,26 @@ class VKIE(InfoExtractor):
             'pass': password.encode('cp1251'),
         })
 
-        request = sanitized_Request(
-            'https://login.vk.com/?act=login',
-            urlencode_postdata(login_form))
+        # https://new.vk.com/ serves two identical remixlhk cookies in the
+        # Set-Cookie header and expects the first one to be set rather than
+        # the second (see
+        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
+        # Per RFC 6265 the newer cookie is the one stored in the cookie jar,
+        # which is what actually happens.
+        # Work around this VK issue by manually resetting the remixlhk cookie
+        # to the first one.
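+        # On Python 3 http.client decodes header values as latin-1; re-encode
+        # and decode as UTF-8 so multibyte cookie values are read correctly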
+        cookies = url_handle.headers.get('Set-Cookie')
+        if cookies:
+            if sys.version_info[0] >= 3:
+                cookies = cookies.encode('iso-8859-1')
+            cookies = cookies.decode('utf-8')
+            remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
+            if remixlhk:
+                value, domain = remixlhk.groups()
+                self._set_cookie(domain, 'remixlhk', value)
+
         login_page = self._download_webpage(
-            request, None, note='Logging in as %s' % username)
+            'https://login.vk.com/?act=login', None,
+            note='Logging in as %s' % username,
+            data=urlencode_postdata(login_form))
 
         if re.search(r'onLoginFailed', login_page):
             raise ExtractorError(
@@ -217,20 +236,21 @@ class VKIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
 
-        if not video_id:
+        if video_id:
+            info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+            # Some videos (removed?) can only be downloaded with list id specified
+            list_id = mobj.group('list_id')
+            if list_id:
+                info_url += '&list=%s' % list_id
+        else:
+            info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
             video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
 
-        info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
-
-        # Some videos (removed?) can only be downloaded with list id specified
-        list_id = mobj.group('list_id')
-        if list_id:
-            info_url += '&list=%s' % list_id
-
         info_page = self._download_webpage(info_url, video_id)
 
         error_message = self._html_search_regex(
-            r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+            [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+                r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
             info_page, 'error message', default=None)
         if error_message:
             raise ExtractorError(error_message, expected=True)
@@ -305,17 +325,17 @@ class VKIE(InfoExtractor):
         view_count = None
         views = self._html_search_regex(
             r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
-            info_page, 'view count', fatal=False)
+            info_page, 'view count', default=None)
         if views:
             view_count = str_to_int(self._search_regex(
                 r'([\d,.]+)', views, 'view count', fatal=False))
 
         formats = []
         for k, v in data.items():
-            if not k.startswith('url') and k != 'extra_data' or not v:
+            if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:
                 continue
             height = int_or_none(self._search_regex(
-                r'^url(\d+)', k, 'height', default=None))
+                r'^(?:url|cache)(\d+)', k, 'height', default=None))
             formats.append({
                 'format_id': k,
                 'url': v,
@@ -338,7 +358,7 @@ class VKIE(InfoExtractor):
 class VKUserVideosIE(InfoExtractor):
     IE_NAME = 'vk:uservideos'
     IE_DESC = "VK - User's Videos"
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
+    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
     _TEMPLATE_URL = 'https://vk.com/videos'
     _TESTS = [{
         'url': 'http://vk.com/videos205387401',
@@ -353,6 +373,12 @@ class VKUserVideosIE(InfoExtractor):
     }, {
         'url': 'http://vk.com/videos-97664626?section=all',
         'only_matching': True,
+    }, {
+        'url': 'http://m.vk.com/videos205387401',
+        'only_matching': True,
+    }, {
+        'url': 'http://new.vk.com/videos205387401',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index baf39bb2cea714fb1578d80b4b8a83c9cd67e568..8d671cca767d4592a5428f7d3ad855e952df5353 100644 (file)
@@ -1,11 +1,15 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     dict_get,
+    ExtractorError,
     float_or_none,
     int_or_none,
+    remove_start,
 )
 from ..compat import compat_urllib_parse_urlencode
 
@@ -19,7 +23,7 @@ class VLiveIE(InfoExtractor):
         'info_dict': {
             'id': '1326',
             'ext': 'mp4',
-            'title': "[V] Girl's Day's Broadcast",
+            'title': "[V LIVE] Girl's Day's Broadcast",
             'creator': "Girl's Day",
             'view_count': int,
         },
@@ -31,16 +35,62 @@ class VLiveIE(InfoExtractor):
         webpage = self._download_webpage(
             'http://www.vlive.tv/video/%s' % video_id, video_id)
 
-        long_video_id = self._search_regex(
-            r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"',
-            webpage, 'long video id')
+        video_params = self._search_regex(
+            r'\bvlive\.video\.init\(([^)]+)\)',
+            webpage, 'video params')
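+        # vlive.video.init(...) is called with a flat list of quoted string
+        # arguments; after splitting on the '", "' separators, index 2 is the
+        # status, 5 the live params JSON, 6 the long video id and 7 the key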
+        status, _, _, live_params, long_video_id, key = re.split(
+            r'"\s*,\s*"', video_params)[2:8]
+        status = remove_start(status, 'PRODUCT_')
+
+        if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
+            live_params = self._parse_json('"%s"' % live_params, video_id)
+            live_params = self._parse_json(live_params, video_id)
+            return self._live(video_id, webpage, live_params)
+        elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
+            if long_video_id and key:
+                return self._replay(video_id, webpage, long_video_id, key)
+            else:
+                status = 'COMING_SOON'
 
-        key = self._search_regex(
-            r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"',
-            webpage, 'key')
+        if status == 'LIVE_END':
+            raise ExtractorError('Uploading for replay. Please wait...',
+                                 expected=True)
+        elif status == 'COMING_SOON':
+            raise ExtractorError('Coming soon!', expected=True)
+        elif status == 'CANCELED':
+            raise ExtractorError('We are sorry, '
+                                 'but the live broadcast has been canceled.',
+                                 expected=True)
+        else:
+            raise ExtractorError('Unknown status %s' % status)
 
+    def _get_common_fields(self, webpage):
         title = self._og_search_title(webpage)
+        creator = self._html_search_regex(
+            r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)',
+            webpage, 'creator', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        return {
+            'title': title,
+            'creator': creator,
+            'thumbnail': thumbnail,
+        }
+
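+    # Live broadcasts expose one HLS manifest per entry in
+    # live_params['resolutions']; merge the formats from all of them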
+    def _live(self, video_id, webpage, live_params):
+        formats = []
+        for vid in live_params.get('resolutions', []):
+            formats.extend(self._extract_m3u8_formats(
+                vid['cdnUrl'], video_id, 'mp4',
+                m3u8_id=vid.get('name'),
+                fatal=False, live=True))
+        self._sort_formats(formats)
+
+        return dict(self._get_common_fields(webpage),
+                    id=video_id,
+                    formats=formats,
+                    is_live=True)
 
+    def _replay(self, video_id, webpage, long_video_id, key):
         playinfo = self._download_json(
             'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
             % compat_urllib_parse_urlencode({
@@ -62,11 +112,6 @@ class VLiveIE(InfoExtractor):
         } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
         self._sort_formats(formats)
 
-        thumbnail = self._og_search_thumbnail(webpage)
-        creator = self._html_search_regex(
-            r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)',
-            webpage, 'creator', fatal=False)
-
         view_count = int_or_none(playinfo.get('meta', {}).get('count'))
 
         subtitles = {}
@@ -77,12 +122,8 @@ class VLiveIE(InfoExtractor):
                     'ext': 'vtt',
                     'url': caption['source']}]
 
-        return {
-            'id': video_id,
-            'title': title,
-            'creator': creator,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        return dict(self._get_common_fields(webpage),
+                    id=video_id,
+                    formats=formats,
+                    view_count=view_count,
+                    subtitles=subtitles)
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
index 93d15a556dedb6e0589dc5f393e0a01ad5b4a8a0..4f1a99a8989d736c1de572e6372b022544102f87 100644 (file)
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -16,13 +19,13 @@ class VoiceRepublicIE(InfoExtractor):
     _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
     _TESTS = [{
         'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
-        'md5': '0554a24d1657915aa8e8f84e15dc9353',
+        'md5': 'b9174d651323f17783000876347116e3',
         'info_dict': {
             'id': '2296',
             'display_id': 'watching-the-watchers-building-a-sousveillance-state',
             'ext': 'm4a',
             'title': 'Watching the Watchers: Building a Sousveillance State',
-            'description': 'md5:715ba964958afa2398df615809cfecb1',
+            'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.',
             'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
             'duration': 1800,
             'view_count': int,
@@ -52,7 +55,7 @@ class VoiceRepublicIE(InfoExtractor):
         if data:
             title = data['title']
             description = data.get('teaser')
-            talk_id = data.get('talk_id') or display_id
+            talk_id = compat_str(data.get('talk_id') or display_id)
             talk = data['talk']
             duration = int_or_none(talk.get('duration'))
             formats = [{
diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py
index 0c6b1f0305801d71bd942b3e2ee9ee307cb4e191..b1b32ad44ecfd796e46219a87ea71caa3587face 100644 (file)
@@ -15,7 +15,8 @@ class VoxMediaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Google\'s new material design direction',
             'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2',
-        }
+        },
+        'add_ie': ['Ooyala'],
     }, {
         # data-ooyala-id
         'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
@@ -25,7 +26,8 @@ class VoxMediaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'The Nexus 6: hands-on with Google\'s phablet',
             'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af',
-        }
+        },
+        'add_ie': ['Ooyala'],
     }, {
         # volume embed
         'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
@@ -35,7 +37,8 @@ class VoxMediaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'The new frontier of LGBTQ civil rights, explained',
             'description': 'md5:0dc58e94a465cbe91d02950f770eb93f',
-        }
+        },
+        'add_ie': ['Ooyala'],
     }, {
         # youtube embed
         'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
@@ -48,7 +51,8 @@ class VoxMediaIE(InfoExtractor):
             'upload_date': '20160324',
             'uploader_id': 'voxdotcom',
             'uploader': 'Vox',
-        }
+        },
+        'add_ie': ['Youtube'],
     }, {
         # SBN.VideoLinkset.entryGroup multiple ooyala embeds
         'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@ -117,7 +121,7 @@ class VoxMediaIE(InfoExtractor):
             volume_webpage = self._download_webpage(
                 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid)
             video_data = self._parse_json(self._search_regex(
-                r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
+                r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
             for provider_video_type in ('ooyala', 'youtube'):
                 provider_video_id = video_data.get('%s_id' % provider_video_type)
                 if provider_video_id:
index 92c90e5172e89b98c3309bb01dc9787f15c24859..1557a0e0406ebfb75c2b5b4583c74f05c5dd2cc7 100644 (file)
@@ -4,6 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     parse_duration,
     str_to_int,
 )
@@ -27,7 +28,8 @@ class VpornIE(InfoExtractor):
                 'duration': 393,
                 'age_limit': 18,
                 'view_count': int,
-            }
+            },
+            'skip': 'video removed',
         },
         {
             'url': 'http://www.vporn.com/female/hana-shower/523564/',
@@ -40,7 +42,7 @@ class VpornIE(InfoExtractor):
                 'description': 'Hana showers at the bathroom.',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'uploader': 'Hmmmmm',
-                'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female'],
+                'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'],
                 'duration': 588,
                 'age_limit': 18,
                 'view_count': int,
@@ -55,6 +57,10 @@ class VpornIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
+        errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!'
+        if errmsg in webpage:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
+
         title = self._html_search_regex(
             r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
         description = self._html_search_regex(
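The takedown check added above aborts with expected=True, so youtube-dl prints a clean message instead of the bug-report traceback. A sketch with a stand-in page:

    from youtube_dl.utils import ExtractorError

    errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!'
    webpage = '<html>%s</html>' % errmsg  # stand-in for the downloaded page
    try:
        if errmsg in webpage:
            raise ExtractorError('Vporn said: %s' % errmsg, expected=True)
    except ExtractorError as e:
        print(e)  # clean, user-facing message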
index 8e35f24e81e7e61240898ea3259914737365fd2f..bec7ab327008803f8609ea0e78e7d70577556940 100644 (file)
@@ -25,7 +25,8 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1414271750.949,
                 'upload_date': '20141025',
                 'duration': 929,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         # sporza.be
         {
@@ -39,7 +40,8 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1413835980.560,
                 'upload_date': '20141020',
                 'duration': 3238,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         # cobra.be
         {
@@ -53,16 +55,39 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1413967500.494,
                 'upload_date': '20141022',
                 'duration': 661,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         {
             # YouTube video
             'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957',
-            'only_matching': True,
+            'md5': 'b8b93da1df1cea6c8556255a796b7d61',
+            'info_dict': {
+                'id': 'Wji-BZ0oCwg',
+                'ext': 'mp4',
+                'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer',
+                'description': 'md5:8e468944dce15567a786a67f74262583',
+                'uploader': 'Star Wars',
+                'uploader_id': 'starwars',
+                'upload_date': '20160407',
+            },
+            'add_ie': ['Youtube'],
         },
         {
             'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
-            'only_matching': True,
+            'md5': '',
+            'info_dict': {
+                'id': '2377055',
+                'ext': 'mp4',
+                'title': 'Cafe Derby',
+                'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.',
+                'upload_date': '20150626',
+                'timestamp': 1435305240.769,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
         }
     ]
 
@@ -98,6 +123,32 @@ class VRTIE(InfoExtractor):
                 formats.extend(self._extract_m3u8_formats(
                     src, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_f4m_formats(
+                    src.replace('playlist.m3u8', 'manifest.f4m'),
+                    video_id, f4m_id='hds', fatal=False))
+                if 'data-video-geoblocking="true"' not in webpage:
+                    rtmp_formats = self._extract_smil_formats(
+                        src.replace('playlist.m3u8', 'jwplayer.smil'),
+                        video_id, fatal=False)
+                    formats.extend(rtmp_formats)
+                    for rtmp_format in rtmp_formats:
+                        rtmp_format_c = rtmp_format.copy()
+                        rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+                        del rtmp_format_c['play_path']
+                        del rtmp_format_c['ext']
+                        http_format = rtmp_format_c.copy()
+                        http_format.update({
+                            'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''),
+                            'format_id': rtmp_format['format_id'].replace('rtmp', 'http'),
+                            'protocol': 'http',
+                        })
+                        rtsp_format = rtmp_format_c.copy()
+                        rtsp_format.update({
+                            'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+                            'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                            'protocol': 'rtsp',
+                        })
+                        formats.extend([http_format, rtsp_format])
             else:
                 formats.extend(self._extract_f4m_formats(
                     '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False))
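For non-geoblocked videos the hunk above fans every SMIL-derived RTMP format out into progressive-HTTP and RTSP variants by rewriting the URL. A minimal sketch of that rewriting with an invented format dict:

    rtmp = {
        'format_id': 'rtmp-0',
        'url': 'rtmp://vod.example.be/_definst_',
        'play_path': 'mp4:clip.mp4',
        'ext': 'flv',
    }
    # fold play_path into the URL, then drop the RTMP-only keys
    base = dict(rtmp, url='%s/%s' % (rtmp['url'], rtmp['play_path']))
    del base['play_path'], base['ext']
    http_format = dict(base, format_id='http-0', protocol='http',
                       url=base['url'].replace('rtmp://', 'http://').replace('vod.', 'download.')
                                      .replace('/_definst_/', '/').replace('mp4:', ''))
    rtsp_format = dict(base, format_id='rtsp-0', protocol='rtsp',
                       url=base['url'].replace('rtmp://', 'rtsp://'))
    assert http_format['url'] == 'http://download.example.be/clip.mp4'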
diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py
deleted file mode 100644 (file)
index faa167e..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-import os.path
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_iso8601,
-)
-
-
-class VultureIE(InfoExtractor):
-    IE_NAME = 'vulture.com'
-    _VALID_URL = r'https?://video\.vulture\.com/video/(?P<display_id>[^/]+)/'
-    _TEST = {
-        'url': 'http://video.vulture.com/video/Mindy-Kaling-s-Harvard-Speech/player?layout=compact&read_more=1',
-        'md5': '8d997845642a2b5152820f7257871bc8',
-        'info_dict': {
-            'id': '6GHRQL3RV7MSD1H4',
-            'ext': 'mp4',
-            'title': 'kaling-speech-2-MAGNIFY STANDARD CONTAINER REVISED',
-            'uploader_id': 'Sarah',
-            'thumbnail': 're:^http://.*\.jpg$',
-            'timestamp': 1401288564,
-            'upload_date': '20140528',
-            'description': 'Uplifting and witty, as predicted.',
-            'duration': 1015,
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
-
-        webpage = self._download_webpage(url, display_id)
-        query_string = self._search_regex(
-            r"queryString\s*=\s*'([^']+)'", webpage, 'query string')
-        video_id = self._search_regex(
-            r'content=([^&]+)', query_string, 'video ID')
-        query_url = 'http://video.vulture.com/embed/player/container/1000/1000/?%s' % query_string
-
-        query_webpage = self._download_webpage(
-            query_url, display_id, note='Downloading query page')
-        params_json = self._search_regex(
-            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
-            query_webpage,
-            'player params')
-        params = json.loads(params_json)
-
-        upload_timestamp = parse_iso8601(params['posted'].replace(' ', 'T'))
-        uploader_id = params.get('user', {}).get('handle')
-
-        media_item = params['media_item']
-        title = os.path.splitext(media_item['title'])[0]
-        duration = int_or_none(media_item.get('duration_seconds'))
-
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'url': media_item['pipeline_xid'],
-            'title': title,
-            'timestamp': upload_timestamp,
-            'thumbnail': params.get('thumbnail_url'),
-            'uploader_id': uploader_id,
-            'description': params.get('description'),
-            'duration': duration,
-        }
index ec8b999983f6ae89a3bf53909e9d70a463f87f52..839cad986cbbf4edc8f73ca5639e780f210163c2 100644 (file)
@@ -11,7 +11,96 @@ from ..utils import (
 
 
 class WashingtonPostIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+    IE_NAME = 'washingtonpost'
+    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TEST = {
+        'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+        'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
+        'info_dict': {
+            'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+            'ext': 'mp4',
+            'title': 'Egypt finds belongings, debris from plane crash',
+            'description': 'md5:a17ceee432f215a5371388c1f680bd86',
+            'upload_date': '20160520',
+            'uploader': 'Reuters',
+            'timestamp': 1463778452,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
+            video_id, transform_source=strip_jsonp)[0]['contentConfig']
+        title = video_data['title']
+
+        urls = []
+        formats = []
+        for s in video_data.get('streams', []):
+            s_url = s.get('url')
+            if not s_url or s_url in urls:
+                continue
+            urls.append(s_url)
+            video_type = s.get('type')
+            if video_type == 'smil':
+                continue
+            elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
+                m3u8_formats = self._extract_m3u8_formats(
+                    s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+                for m3u8_format in m3u8_formats:
+                    width = m3u8_format.get('width')
+                    if not width:
+                        continue
+                    vbr = self._search_regex(
+                        r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
+                    if vbr:
+                        m3u8_format.update({
+                            'vbr': int_or_none(vbr),
+                        })
+                formats.extend(m3u8_formats)
+            else:
+                width = int_or_none(s.get('width'))
+                vbr = int_or_none(s.get('bitrate'))
+                has_width = width != 0
+                formats.append({
+                    'format_id': (
+                        '%s-%d-%d' % (video_type, width, vbr)
+                        if width
+                        else video_type),
+                    'vbr': vbr if has_width else None,
+                    'width': width,
+                    'height': int_or_none(s.get('height')),
+                    'acodec': s.get('audioCodec'),
+                    'vcodec': s.get('videoCodec') if has_width else 'none',
+                    'filesize': int_or_none(s.get('fileSize')),
+                    'url': s_url,
+                    'ext': 'mp4',
+                    'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
+                })
+        source_media_url = video_data.get('sourceMediaURL')
+        if source_media_url:
+            formats.append({
+                'format_id': 'source_media',
+                'url': source_media_url,
+            })
+        self._sort_formats(
+            formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('blurb'),
+            'uploader': video_data.get('credits', {}).get('source'),
+            'formats': formats,
+            'duration': int_or_none(video_data.get('videoDuration'), 100),
+            'timestamp': int_or_none(
+                video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
+        }
+
+
+class WashingtonPostArticleIE(InfoExtractor):
+    IE_NAME = 'washingtonpost:article'
+    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
         'info_dict': {
@@ -63,6 +152,10 @@ class WashingtonPostIE(InfoExtractor):
         }]
     }]
 
+    @classmethod
+    def suitable(cls, url):
+        return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url)
+
     def _real_extract(self, url):
         page_id = self._match_id(url)
         webpage = self._download_webpage(url, page_id)
@@ -74,54 +167,7 @@ class WashingtonPostIE(InfoExtractor):
                 <div\s+class="posttv-video-embed[^>]*?data-uuid=|
                 data-video-uuid=
             )"([^"]+)"''', webpage)
-        entries = []
-        for i, uuid in enumerate(uuids, start=1):
-            vinfo_all = self._download_json(
-                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
-                page_id,
-                transform_source=strip_jsonp,
-                note='Downloading information of video %d/%d' % (i, len(uuids))
-            )
-            vinfo = vinfo_all[0]['contentConfig']
-            uploader = vinfo.get('credits', {}).get('source')
-            timestamp = int_or_none(
-                vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
-
-            formats = [{
-                'format_id': (
-                    '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
-                    if s.get('width')
-                    else s.get('type')),
-                'vbr': s.get('bitrate') if s.get('width') != 0 else None,
-                'width': s.get('width'),
-                'height': s.get('height'),
-                'acodec': s.get('audioCodec'),
-                'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
-                'filesize': s.get('fileSize'),
-                'url': s.get('url'),
-                'ext': 'mp4',
-                'preference': -100 if s.get('type') == 'smil' else None,
-                'protocol': {
-                    'MP4': 'http',
-                    'F4F': 'f4m',
-                }.get(s.get('type')),
-            } for s in vinfo.get('streams', [])]
-            source_media_url = vinfo.get('sourceMediaURL')
-            if source_media_url:
-                formats.append({
-                    'format_id': 'source_media',
-                    'url': source_media_url,
-                })
-            self._sort_formats(formats)
-            entries.append({
-                'id': uuid,
-                'title': vinfo['title'],
-                'description': vinfo.get('blurb'),
-                'uploader': uploader,
-                'formats': formats,
-                'duration': int_or_none(vinfo.get('videoDuration'), 100),
-                'timestamp': timestamp,
-            })
+        entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
 
         return {
             '_type': 'playlist',
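With the split into washingtonpost and washingtonpost:article, the article extractor's loose URL pattern would also match plain video pages, so its suitable() defers to the more specific class. A toy sketch of the handoff (the classes below are stand-ins, not the real extractors):

    import re

    class VideoIE(object):
        _VALID_URL = r'https?://example\.com/video/(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'

        @classmethod
        def suitable(cls, url):
            return re.match(cls._VALID_URL, url) is not None

    class ArticleIE(object):
        _VALID_URL = r'https?://example\.com/.+'

        @classmethod
        def suitable(cls, url):
            # bow out whenever the more specific extractor claims the URL
            return not VideoIE.suitable(url) and re.match(cls._VALID_URL, url) is not None

    url = 'https://example.com/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d'
    assert VideoIE.suitable(url) and not ArticleIE.suitable(url)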
index 5227bb5ad9a2cd4f71c156cd8ca9bb3f5fbd5d17..de7d6b55935cd5fd8edb4c83c581505c2f0f4214 100644 (file)
@@ -2,25 +2,26 @@
 from __future__ import unicode_literals
 
 import re
-import hashlib
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     unified_strdate,
+    HEADRequest,
+    float_or_none,
 )
 
 
 class WatIE(InfoExtractor):
-    _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
+    _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
     IE_NAME = 'wat.tv'
     _TESTS = [
         {
             'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
-            'md5': 'ce70e9223945ed26a8056d413ca55dc9',
+            'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
             'info_dict': {
                 'id': '11713067',
-                'display_id': 'soupe-figues-l-orange-aux-epices',
                 'ext': 'mp4',
                 'title': 'Soupe de figues à l\'orange et aux épices',
                 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
@@ -33,7 +34,6 @@ class WatIE(InfoExtractor):
             'md5': 'fbc84e4378165278e743956d9c1bf16b',
             'info_dict': {
                 'id': '11713075',
-                'display_id': 'gregory-lemarchal-voix-ange',
                 'ext': 'mp4',
                 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
                 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
@@ -44,96 +44,85 @@ class WatIE(InfoExtractor):
         },
     ]
 
-    def download_video_info(self, real_id):
-        # 'contentv4' is used in the website, but it also returns the related
-        # videos, we don't need them
-        info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
-        return info['media']
-
     def _real_extract(self, url):
-        def real_id_for_chapter(chapter):
-            return chapter['tc_start'].split('-')[0]
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
-        real_id = mobj.group('real_id')
-        if not real_id:
-            short_id = mobj.group('short_id')
-            webpage = self._download_webpage(url, display_id or short_id)
-            real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+        video_id = self._match_id(url)
+        video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
 
-        video_info = self.download_video_info(real_id)
+        # 'contentv4' is used on the website, but it also returns related
+        # videos, which we don't need
+        video_info = self._download_json(
+            'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']
 
         error_desc = video_info.get('error_desc')
         if error_desc:
             raise ExtractorError(
                 '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
 
-        geo_list = video_info.get('geoList')
-        country = geo_list[0] if geo_list else ''
-
         chapters = video_info['chapters']
         first_chapter = chapters[0]
-        files = video_info['files']
-        first_file = files[0]
 
-        if real_id_for_chapter(first_chapter) != real_id:
-            self.to_screen('Multipart video detected')
-            chapter_urls = []
-            for chapter in chapters:
-                chapter_id = real_id_for_chapter(chapter)
-                # Yes, when we this chapter is processed by WatIE,
-                # it will download the info again
-                chapter_info = self.download_video_info(chapter_id)
-                chapter_urls.append(chapter_info['url'])
-            entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
-            return self.playlist_result(entries, real_id, video_info['title'])
+        def video_id_for_chapter(chapter):
+            return chapter['tc_start'].split('-')[0]
 
-        upload_date = None
-        if 'date_diffusion' in first_chapter:
-            upload_date = unified_strdate(first_chapter['date_diffusion'])
+        if video_id_for_chapter(first_chapter) != video_id:
+            self.to_screen('Multipart video detected')
+            entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
+            return self.playlist_result(entries, video_id, video_info['title'])
         # Otherwise we can continue and extract just one part; we have to use
-        # the short id for getting the video url
-
-        formats = [{
-            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
-            'format_id': 'Mobile',
-        }]
-
-        fmts = [('SD', 'web')]
-        if first_file.get('hasHD'):
-            fmts.append(('HD', 'webhd'))
-
-        def compute_token(param):
-            timestamp = '%08x' % int(self._download_webpage(
-                'http://www.wat.tv/servertime', real_id,
-                'Downloading server time').split('|')[0])
-            magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
-            return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
-
-        for fmt in fmts:
-            webid = '/%s/%s' % (fmt[1], real_id)
-            video_url = self._download_webpage(
-                'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country),
-                real_id,
-                'Downloading %s video URL' % fmt[0],
-                'Failed to download %s video URL' % fmt[0],
-                False)
-            if not video_url:
+        # the video id for getting the video url
+
+        date_diffusion = first_chapter.get('date_diffusion')
+        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+
+        def extract_url(path_template, url_type):
+            req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
+            head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
+            red_url = head.geturl()
+            if req_url == red_url:
+                raise ExtractorError(
+                    '%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
+                    expected=True)
+            return red_url
+
+        m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
+        http_url = extract_url('android5/%s.mp4', 'http')
+
+        formats = []
+        m3u8_formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+        formats.extend(m3u8_formats)
+        formats.extend(self._extract_f4m_formats(
+            m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
+            video_id, f4m_id='hds', fatal=False))
+        for m3u8_format in m3u8_formats:
+            mobj = re.search(
+                r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url'])
+            if not mobj:
                 continue
-            formats.append({
-                'url': video_url,
-                'ext': 'mp4',
-                'format_id': fmt[0],
+            abr, vbr = mobj.groups()
+            abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+            m3u8_format.update({
+                'vbr': vbr,
+                'abr': abr,
+            })
+            if not vbr or not abr:
+                continue
+            f = m3u8_format.copy()
+            f.update({
+                'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url),
+                'format_id': f['format_id'].replace('hls', 'http'),
+                'protocol': 'http',
             })
+            formats.append(f)
+        self._sort_formats(formats)
 
         return {
-            'id': real_id,
-            'display_id': display_id,
+            'id': video_id,
             'title': first_chapter['title'],
             'thumbnail': first_chapter['preview'],
             'description': first_chapter['description'],
             'view_count': video_info['views'],
             'upload_date': upload_date,
-            'duration': first_file['duration'],
+            'duration': video_info['files'][0]['duration'],
             'formats': formats,
         }
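The reworked id handling treats the short token in wat.tv URLs as a base-36 encoding of the numeric media id. A quick check with the short id from the first test URL above:

    short_id = '6z1uz'  # from .../soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html
    # long all-digit ids pass through unchanged; short tokens are decoded
    video_id = short_id if short_id.isdigit() and len(short_id) > 6 else str(int(short_id, 36))
    assert video_id == '11713067'  # matches the id in the test's info_dict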
diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py
new file mode 100644 (file)
index 0000000..5d3b5bd
--- /dev/null
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    parse_duration,
+    int_or_none,
+)
+
+
+class WatchIndianPornIE(InfoExtractor):
+    IE_DESC = 'Watch Indian Porn'
+    _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
+    _TEST = {
+        'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html',
+        'md5': '249589a164dde236ec65832bfce17440',
+        'info_dict': {
+            'id': 'RZa2avywNPa',
+            'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera',
+            'ext': 'mp4',
+            'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'LoveJay',
+            'upload_date': '20160428',
+            'duration': 226,
+            'view_count': int,
+            'comment_count': int,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._html_search_regex(
+            r"url: escape\('([^']+)'\)", webpage, 'url')
+
+        title = self._html_search_regex(
+            r'<h2 class="he2"><span>(.*?)</span>',
+            webpage, 'title')
+        thumbnail = self._html_search_regex(
+            r'<span id="container"><img\s+src="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+
+        uploader = self._html_search_regex(
+            r'class="aupa">\s*(.*?)</a>',
+            webpage, 'uploader')
+        upload_date = unified_strdate(self._html_search_regex(
+            r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False))
+
+        duration = parse_duration(self._search_regex(
+            r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>',
+            webpage, 'duration', fatal=False))
+
+        view_count = int_or_none(self._search_regex(
+            r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._search_regex(
+            r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>',
+            webpage, 'comment count', fatal=False))
+
+        categories = re.findall(
+            r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>',
+            webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'http_headers': {
+                'Referer': url,
+            },
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'categories': categories,
+            'age_limit': 18,
+        }
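The new extractor ships a per-download Referer via 'http_headers': the downloader attaches those headers when it fetches the media URL, which this site apparently checks. A minimal sketch (the media URL is a placeholder):

    info = {
        'id': 'RZa2avywNPa',
        'url': 'http://media.example.net/video.mp4',  # placeholder media URL
        # merged with the global headers by the downloader
        'http_headers': {
            'Referer': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html',
        },
    }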
index 31c90430327da895ffc974c1d489cb4c92689d2f..390f9e8302f392a25c83af2520cb30b854500632 100644 (file)
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-import itertools
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_parse_qs,
-    compat_urlparse,
-)
 from ..utils import (
+    determine_ext,
+    ExtractorError,
+    js_to_json,
+    strip_jsonp,
     unified_strdate,
-    qualities,
+    update_url_query,
+    urlhandle_detect_ext,
 )
 
 
-class WDRIE(InfoExtractor):
-    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
-    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+class WDRBaseIE(InfoExtractor):
+    def _extract_wdr_video(self, webpage, display_id):
+        # for wdr.de the data-extension is in a tag with the class "mediaLink"
+        # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
+        # for wdrmaus, it's in a link to the page, in a multiline "videoLink" tag
+        json_metadata = self._html_search_regex(
+            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
+            webpage, 'media link', default=None, flags=re.MULTILINE)
+
+        if not json_metadata:
+            return
+
+        media_link_obj = self._parse_json(json_metadata, display_id,
+                                          transform_source=js_to_json)
+        jsonp_url = media_link_obj['mediaObj']['url']
+
+        metadata = self._download_json(
+            jsonp_url, 'metadata', transform_source=strip_jsonp)
+
+        metadata_tracker_data = metadata['trackerData']
+        metadata_media_resource = metadata['mediaResource']
+
+        formats = []
+
+        # check if the metadata contains a direct URL to a file
+        for kind, media_resource in metadata_media_resource.items():
+            if kind not in ('dflt', 'alt'):
+                continue
+
+            for tag_name, medium_url in media_resource.items():
+                if tag_name not in ('videoURL', 'audioURL'):
+                    continue
+
+                ext = determine_ext(medium_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        medium_url, display_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls'))
+                elif ext == 'f4m':
+                    manifest_url = update_url_query(
+                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, display_id, f4m_id='hds', fatal=False))
+                elif ext == 'smil':
+                    formats.extend(self._extract_smil_formats(
+                        medium_url, 'stream', fatal=False))
+                else:
+                    a_format = {
+                        'url': medium_url
+                    }
+                    if ext == 'unknown_video':
+                        urlh = self._request_webpage(
+                            medium_url, display_id, note='Determining extension')
+                        ext = urlhandle_detect_ext(urlh)
+                        a_format['ext'] = ext
+                    formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        caption_url = metadata_media_resource.get('captionURL')
+        if caption_url:
+            subtitles['de'] = [{
+                'url': caption_url,
+                'ext': 'ttml',
+            }]
+
+        title = metadata_tracker_data['trackerClipTitle']
+
+        return {
+            'id': metadata_tracker_data.get('trackerClipId', display_id),
+            'display_id': display_id,
+            'title': title,
+            'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')),
+        }
+
+
+class WDRIE(WDRBaseIE):
+    _CURRENT_MAUS_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
+    _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
+    _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
 
     _TESTS = [
         {
-            'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
+            'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
+            # HDS download, MD5 is unstable
             'info_dict': {
-                'id': 'mdb-362427',
+                'id': 'mdb-1058683',
                 'ext': 'flv',
-                'title': 'Servicezeit',
-                'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
-                'upload_date': '20140310',
-                'is_live': False
+                'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
+                'title': 'Geheimnis Aachener Dom',
+                'alt_title': 'Doku am Freitag',
+                'upload_date': '20160304',
+                'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
+                'is_live': False,
+                'subtitles': {'de': [{
+                    'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml',
+                    'ext': 'ttml',
+                }]},
             },
-            'params': {
-                'skip_download': True,
+        },
+        {
+            'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
+            'md5': 'f4c1f96d01cf285240f53ea4309663d8',
+            'info_dict': {
+                'id': 'mdb-1072000',
+                'ext': 'mp3',
+                'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
+                'title': 'Schriftstellerin Juli Zeh',
+                'alt_title': 'WDR 3 Gespräch am Samstag',
+                'upload_date': '20160312',
+                'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
+                'is_live': False,
+                'subtitles': {}
             },
-            'skip': 'Page Not Found',
         },
         {
-            'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html',
+            'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
             'info_dict': {
-                'id': 'mdb-363194',
-                'ext': 'flv',
-                'title': 'Marga Spiegel ist tot',
-                'description': 'md5:2309992a6716c347891c045be50992e4',
-                'upload_date': '20140311',
-                'is_live': False
+                'id': 'mdb-103364',
+                'ext': 'mp4',
+                'display_id': 'index',
+                'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'alt_title': 'WDR Fernsehen Live',
+                'upload_date': None,
+                'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+                'is_live': True,
+                'subtitles': {}
             },
             'params': {
-                'skip_download': True,
+                'skip_download': True,  # m3u8 download
             },
-            'skip': 'Page Not Found',
         },
         {
-            'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
-            'md5': '83e9e8fefad36f357278759870805898',
+            'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
+            'playlist_mincount': 8,
             'info_dict': {
-                'id': 'mdb-194332',
-                'ext': 'mp3',
-                'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
-                'description': 'md5:2309992a6716c347891c045be50992e4',
-                'upload_date': '20091129',
-                'is_live': False
+                'id': 'aktuelle-stunde/aktuelle-stunde-120',
             },
         },
         {
-            'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html',
-            'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa',
+            'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
             'info_dict': {
-                'id': 'mdb-478135',
-                'ext': 'mp3',
-                'title': 'Flavia Coelho: Amar é Amar',
-                'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
-                'upload_date': '20140717',
-                'is_live': False
+                'id': 'mdb-1096487',
+                'ext': 'flv',
+                'upload_date': 're:^[0-9]{8}$',
+                'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
+                'description': '- Die Sendung mit der Maus -',
             },
-            'skip': 'Page Not Found',
+            'skip': 'The id changes from week to week with each new episode'
         },
         {
-            'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
-            'playlist_mincount': 146,
+            'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
+            'md5': '803138901f6368ee497b4d195bb164f2',
             'info_dict': {
-                'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
-            }
+                'id': 'mdb-186083',
+                'ext': 'mp4',
+                'upload_date': '20130919',
+                'title': 'Sachgeschichte - Achterbahn ',
+                'description': '- Die Sendung mit der Maus -',
+            },
         },
         {
-            'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',
+            'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
+            # Live stream, MD5 unstable
             'info_dict': {
-                'id': 'mdb-103364',
-                'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-                'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+                'id': 'mdb-869971',
                 'ext': 'flv',
-                'upload_date': '20150101',
-                'is_live': True
-            },
-            'params': {
-                'skip_download': True,
+                'title': 'Funkhaus Europa Livestream',
+                'description': 'md5:2309992a6716c347891c045be50992e4',
+                'upload_date': '20160101',
             },
         }
     ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        page_url = mobj.group('url')
-        page_id = mobj.group('id')
+        url_type = mobj.group('type')
+        page_url = mobj.group('page_url')
+        display_id = mobj.group('display_id')
+        webpage = self._download_webpage(url, display_id)
 
-        webpage = self._download_webpage(url, page_id)
+        info_dict = self._extract_wdr_video(webpage, display_id)
 
-        if mobj.group('player') is None:
+        if not info_dict:
             entries = [
-                self.url_result(page_url + href, 'WDR')
+                self.url_result(page_url + href[0], 'WDR')
                 for href in re.findall(
-                    r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX,
+                    r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX,
                     webpage)
             ]
 
             if entries:  # Playlist page
-                return self.playlist_result(entries, page_id)
+                return self.playlist_result(entries, playlist_id=display_id)
 
-            # Overview page
-            entries = []
-            for page_num in itertools.count(2):
-                hrefs = re.findall(
-                    r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
-                    webpage)
-                entries.extend(
-                    self.url_result(page_url + href, 'WDR')
-                    for href in hrefs)
-                next_url_m = re.search(
-                    r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
-                if not next_url_m:
-                    break
-                next_url = page_url + next_url_m.group(1)
-                webpage = self._download_webpage(
-                    next_url, page_id,
-                    note='Downloading playlist page %d' % page_num)
-            return self.playlist_result(entries, page_id)
-
-        flashvars = compat_parse_qs(self._html_search_regex(
-            r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+            raise ExtractorError('No downloadable streams found', expected=True)
 
-        page_id = flashvars['trackerClipId'][0]
-        video_url = flashvars['dslSrc'][0]
-        title = flashvars['trackerClipTitle'][0]
-        thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
-        is_live = flashvars.get('isLive', ['0'])[0] == '1'
+        is_live = url_type == 'live'
 
         if is_live:
-            title = self._live_title(title)
-
-        if 'trackerClipAirTime' in flashvars:
-            upload_date = flashvars['trackerClipAirTime'][0]
-        else:
-            upload_date = self._html_search_meta(
-                'DC.Date', webpage, 'upload date')
-
-        if upload_date:
-            upload_date = unified_strdate(upload_date)
-
-        formats = []
-        preference = qualities(['S', 'M', 'L', 'XL'])
-
-        if video_url.endswith('.f4m'):
-            formats.extend(self._extract_f4m_formats(
-                video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id,
-                f4m_id='hds', fatal=False))
-        elif video_url.endswith('.smil'):
-            formats.extend(self._extract_smil_formats(
-                video_url, page_id, False, {
-                    'hdcore': '3.3.0',
-                    'plugin': 'aasp-3.3.0.99.43',
-                }))
-        else:
-            formats.append({
-                'url': video_url,
-                'http_headers': {
-                    'User-Agent': 'mobile',
-                },
+            info_dict.update({
+                'title': self._live_title(info_dict['title']),
+                'upload_date': None,
             })
+        elif 'upload_date' not in info_dict:
+            info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date'))
 
-        m3u8_url = self._search_regex(
-            r'rel="adaptiv"[^>]+href="([^"]+)"',
-            webpage, 'm3u8 url', default=None)
-        if m3u8_url:
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, page_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False))
+        info_dict.update({
+            'description': self._html_search_meta('Description', webpage),
+            'is_live': is_live,
+        })
 
-        direct_urls = re.findall(
-            r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
-        if direct_urls:
-            for quality, video_url in direct_urls:
-                formats.append({
-                    'url': video_url,
-                    'preference': preference(quality),
-                    'http_headers': {
-                        'User-Agent': 'mobile',
-                    },
-                })
-
-        self._sort_formats(formats)
-
-        description = self._html_search_meta('Description', webpage, 'description')
-
-        return {
-            'id': page_id,
-            'formats': formats,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-            'is_live': is_live
-        }
+        return info_dict
 
 
 class WDRMobileIE(InfoExtractor):
@@ -241,81 +262,3 @@ class WDRMobileIE(InfoExtractor):
                 'User-Agent': 'mobile',
             },
         }
-
-
-class WDRMausIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
-    IE_DESC = 'Sendung mit der Maus'
-    _TESTS = [{
-        'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
-        'info_dict': {
-            'id': 'aktuelle-sendung',
-            'ext': 'mp4',
-            'thumbnail': 're:^http://.+\.jpg',
-            'upload_date': 're:^[0-9]{8}$',
-            'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
-        }
-    }, {
-        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
-        'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
-        'info_dict': {
-            'id': '40_jahre_maus',
-            'ext': 'mp4',
-            'thumbnail': 're:^http://.+\.jpg',
-            'upload_date': '20131007',
-            'title': '12.03.2011 - 40 Jahre Maus',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-        param_code = self._html_search_regex(
-            r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')
-
-        title_date = self._search_regex(
-            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
-            webpage, 'air date')
-        title_str = self._html_search_regex(
-            r'<h1>(.*?)</h1>', webpage, 'title')
-        title = '%s - %s' % (title_date, title_str)
-        upload_date = unified_strdate(
-            self._html_search_meta('dc.date', webpage))
-
-        fields = compat_parse_qs(param_code)
-        video_url = fields['firstVideo'][0]
-        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
-
-        formats = [{
-            'format_id': 'rtmp',
-            'url': video_url,
-        }]
-
-        jscode = self._download_webpage(
-            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
-            video_id, fatal=False,
-            note='Downloading URL translation table',
-            errnote='Could not download URL translation table')
-        if jscode:
-            for m in re.finditer(
-                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
-                    jscode):
-                if video_url.startswith(m.group('stream')):
-                    http_url = video_url.replace(
-                        m.group('stream'), m.group('dl'))
-                    formats.append({
-                        'format_id': 'http',
-                        'url': http_url,
-                    })
-                    break
-
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': thumbnail,
-            'upload_date': upload_date,
-        }
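The new _extract_wdr_video() resolves metadata in two hops: the data-extension attribute holds JS-flavoured JSON with a mediaObj URL, and that URL serves a JSONP document. A sketch of both unwrapping steps, assuming the js_to_json/strip_jsonp helpers from youtube_dl.utils and an invented payload:

    import json

    from youtube_dl.utils import js_to_json, strip_jsonp

    extension = "{mediaObj: {'url': 'http://deviceids.example.wdr.de/ondemand/1058683.js'}}"
    media_link_obj = json.loads(js_to_json(extension))  # bare keys, single quotes -> JSON
    jsonp = 'mediaObjCallback({"trackerData": {"trackerClipTitle": "Geheimnis Aachener Dom"}})'
    metadata = json.loads(strip_jsonp(jsonp))  # drop the callback wrapper
    assert metadata['trackerData']['trackerClipTitle'] == 'Geheimnis Aachener Dom'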
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
deleted file mode 100644 (file)
index 20bb039..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class WeiboIE(InfoExtractor):
-    """
-    The videos in Weibo come from different sites, this IE just finds the link
-    to the external video and returns it.
-    """
-    _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
-
-    _TEST = {
-        'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
-        'info_dict': {
-            'id': '98322879',
-            'ext': 'flv',
-            'title': '魔声耳机最新广告“All Eyes On Us”',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'add_ie': ['Sina'],
-    }
-
-    # Additional example videos from different sites
-    # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm
-    # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
-        video_id = mobj.group('id')
-        info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
-        info = self._download_json(info_url, video_id)
-
-        videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
-        # Prefer sina video since they have thumbnails
-        videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u)
-        player_url = videos_urls[-1]
-        m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html',
-                          player_url)
-        if m_sina is not None:
-            self.to_screen('Sina video detected')
-            sina_id = m_sina.group(1)
-            player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
-        return self.url_result(player_url)
index 828c03dc38c4d4d4668f6dfb66e4cc29c51fd7e5..54eb5142793827f8b733592d22b979d326593bee 100644 (file)
@@ -1,29 +1,33 @@
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
 from .youtube import YoutubeIE
+from .jwplatform import JWPlatformBaseIE
 
 
-class WimpIE(InfoExtractor):
+class WimpIE(JWPlatformBaseIE):
     _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)'
     _TESTS = [{
-        'url': 'http://www.wimp.com/maruexhausted/',
+        'url': 'http://www.wimp.com/maru-is-exhausted/',
         'md5': 'ee21217ffd66d058e8b16be340b74883',
         'info_dict': {
-            'id': 'maruexhausted',
+            'id': 'maru-is-exhausted',
             'ext': 'mp4',
             'title': 'Maru is exhausted.',
             'description': 'md5:57e099e857c0a4ea312542b684a869b8',
         }
     }, {
         'url': 'http://www.wimp.com/clowncar/',
-        'md5': '4e2986c793694b55b37cf92521d12bb4',
+        'md5': '5c31ad862a90dc5b1f023956faec13fe',
         'info_dict': {
-            'id': 'clowncar',
+            'id': 'cG4CEr2aiSg',
             'ext': 'webm',
-            'title': 'It\'s like a clown car.',
-            'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
+            'title': 'Basset hound clown car...incredible!',
+            'description': '5 of my Bassets crawled in this dog loo! www.bellinghambassets.com\n\nFor licensing/usage please contact: licensing(at)jukinmediadotcom',
+            'upload_date': '20140303',
+            'uploader': 'Gretchen Hoey',
+            'uploader_id': 'gretchenandjeff1',
         },
+        'add_ie': ['Youtube'],
     }]
 
     def _real_extract(self, url):
@@ -41,14 +45,13 @@ class WimpIE(InfoExtractor):
                 'ie_key': YoutubeIE.ie_key(),
             }
 
-        video_url = self._search_regex(
-            r'<video[^>]+>\s*<source[^>]+src=(["\'])(?P<url>.+?)\1',
-            webpage, 'video URL', group='url')
+        info_dict = self._extract_jwplayer_data(
+            webpage, video_id, require_title=False)
 
-        return {
+        info_dict.update({
             'id': video_id,
-            'url': video_url,
             'title': self._og_search_title(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
             'description': self._og_search_description(webpage),
-        }
+        })
+
+        return info_dict
index 8b14840a2dba606951f1f7d80694f1e7f0cca8d6..c634b8decddf8fdb15649b05e8f49ad9efc36254 100644 (file)
@@ -3,16 +3,17 @@ from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    sanitized_Request,
     int_or_none,
+    float_or_none,
 )
 
 
 class WistiaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
-    _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json'
+    _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)'
+    _API_URL = 'http://fast.wistia.com/embed/medias/%s.json'
+    _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
         'md5': 'cafeb56ec0c53c18c97405eecb3133df',
         'info_dict': {
@@ -24,36 +25,54 @@ class WistiaIE(InfoExtractor):
             'timestamp': 1386185018,
             'duration': 117,
         },
-    }
+    }, {
+        'url': 'wistia:sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        # with hls video
+        'url': 'wistia:807fafadvk',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        request = sanitized_Request(self._API_URL.format(video_id))
-        request.add_header('Referer', url)  # Some videos require this.
-        data_json = self._download_json(request, video_id)
+        data_json = self._download_json(
+            self._API_URL % video_id, video_id,
+            # Some videos require this.
+            headers={
+                'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id,
+            })
+
         if data_json.get('error'):
-            raise ExtractorError('Error while getting the playlist',
-                                 expected=True)
+            raise ExtractorError(
+                'Error while getting the playlist', expected=True)
+
         data = data_json['media']
         title = data['name']
 
         formats = []
         thumbnails = []
         for a in data['assets']:
+            aurl = a.get('url')
+            if not aurl:
+                continue
             astatus = a.get('status')
             atype = a.get('type')
-            if (astatus is not None and astatus != 2) or atype == 'preview':
+            if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
                 continue
             elif atype in ('still', 'still_image'):
                 thumbnails.append({
-                    'url': a['url'],
-                    'resolution': '%dx%d' % (a['width'], a['height']),
+                    'url': aurl,
+                    'width': int_or_none(a.get('width')),
+                    'height': int_or_none(a.get('height')),
                 })
             else:
+                aext = a.get('ext')
+                is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8'
                 formats.append({
                     'format_id': atype,
-                    'url': a['url'],
+                    'url': aurl,
                     'tbr': int_or_none(a.get('bitrate')),
                     'vbr': int_or_none(a.get('opt_vbitrate')),
                     'width': int_or_none(a.get('width')),
@@ -61,7 +80,8 @@ class WistiaIE(InfoExtractor):
                     'filesize': int_or_none(a.get('size')),
                     'vcodec': a.get('codec'),
                     'container': a.get('container'),
-                    'ext': a.get('ext'),
+                    'ext': 'mp4' if is_m3u8 else aext,
+                    'protocol': 'm3u8' if is_m3u8 else None,
                     'preference': 1 if atype == 'original' else None,
                 })
 
@@ -73,6 +93,6 @@ class WistiaIE(InfoExtractor):
             'description': data.get('seoDescription'),
             'formats': formats,
             'thumbnails': thumbnails,
-            'duration': int_or_none(data.get('duration')),
+            'duration': float_or_none(data.get('duration')),
             'timestamp': int_or_none(data.get('createdAt')),
         }
index c427649211079715a5510eef3eaf35981bdb1034..bdd7097baec16afb2a3c83dbdde0e89ebc713a69 100644 (file)
@@ -5,8 +5,10 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     int_or_none,
     qualities,
+    remove_start,
 )
 
 
@@ -26,16 +28,17 @@ class WrzutaIE(InfoExtractor):
             'uploader_id': 'laboratoriumdextera',
             'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
         },
+        'skip': 'Redirected to wrzuta.pl',
     }, {
-        'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
-        'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
+        'url': 'http://vexling.wrzuta.pl/audio/01xBFabGXu6/james_horner_-_into_the_na_39_vi_world_bonus',
+        'md5': 'f80564fb5a2ec6ec59705ae2bf2ba56d',
         'info_dict': {
-            'id': '063jOPX5ue2',
-            'ext': 'ogg',
-            'title': 'Liber & Natalia Szroeder - Teraz Ty',
-            'duration': 203,
-            'uploader_id': 'jolka85',
-            'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
+            'id': '01xBFabGXu6',
+            'ext': 'mp3',
+            'title': 'James Horner - Into The Na\'vi World [Bonus]',
+            'description': 'md5:30a70718b2cd9df3120fce4445b0263b',
+            'duration': 95,
+            'uploader_id': 'vexling',
         },
     }]
 
@@ -45,7 +48,10 @@ class WrzutaIE(InfoExtractor):
         typ = mobj.group('typ')
         uploader = mobj.group('uploader')
 
-        webpage = self._download_webpage(url, video_id)
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        if urlh.geturl() == 'http://www.wrzuta.pl/':
+            raise ExtractorError('Video removed', expected=True)
 
         quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
 
@@ -80,3 +86,73 @@ class WrzutaIE(InfoExtractor):
             'description': self._og_search_description(webpage),
             'age_limit': embedpage.get('minimalAge', 0),
         }
+
+
+class WrzutaPlaylistIE(InfoExtractor):
+    """
+        this class covers extraction of wrzuta playlist entries
+        the extraction process bases on following steps:
+        * collect information of playlist size
+        * download all entries provided on
+          the playlist webpage (the playlist is split
+          on two pages: first directly reached from webpage
+          second: downloaded on demand by ajax call and rendered
+          using the ajax call response)
+        * in case size of extracted entries not reached total number of entries
+          use the ajax call to collect the remaining entries
+    """
+
+    IE_NAME = 'wrzuta.pl:playlist'
+    _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/playlista/(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR/moja_muza',
+        'playlist_mincount': 14,
+        'info_dict': {
+            'id': '7XfO4vE84iR',
+            'title': 'Moja muza',
+        },
+    }, {
+        'url': 'http://heroesf70.wrzuta.pl/playlista/6Nj3wQHx756/lipiec_-_lato_2015_muzyka_swiata',
+        'playlist_mincount': 144,
+        'info_dict': {
+            'id': '6Nj3wQHx756',
+            'title': 'Lipiec - Lato 2015 Muzyka Świata',
+        },
+    }, {
+        'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        uploader = mobj.group('uploader')
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        playlist_size = int_or_none(self._html_search_regex(
+            (r'<div[^>]+class=["\']playlist-counter["\'][^>]*>\d+/(\d+)',
+             r'<div[^>]+class=["\']all-counter["\'][^>]*>(.+?)</div>'),
+            webpage, 'playlist size', default=None))
+
+        playlist_title = remove_start(
+            self._og_search_title(webpage), 'Playlista: ')
+
+        entries = []
+        if playlist_size:
+            entries = [
+                self.url_result(entry_url)
+                for _, entry_url in re.findall(
+                    r'<a[^>]+href=(["\'])(http.+?)\1[^>]+class=["\']playlist-file-page',
+                    webpage)]
+            if playlist_size > len(entries):
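+                # the webpage ships only the first chunk of the playlist;
+                # fetch the remainder through the site's AJAX endpoint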
+                playlist_content = self._download_json(
+                    'http://%s.wrzuta.pl/xhr/get_playlist_offset/%s' % (uploader, playlist_id),
+                    playlist_id,
+                    'Downloading playlist JSON',
+                    'Unable to download playlist JSON')
+                entries.extend([
+                    self.url_result(entry['filelink'])
+                    for entry in playlist_content.get('files', []) if entry.get('filelink')])
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
index 5a897371d1d69a95e08f7b4da4d457b3236e09cc..a83e68b17f53ca06fa4b76af6e3aa560fb23e107 100644 (file)
@@ -4,16 +4,22 @@ from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    float_or_none,
     unified_strdate,
 )
 
 
 class WSJIE(InfoExtractor):
-    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
+    _VALID_URL = r'''(?x)https?://
+        (?:
+            video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
+            (?:www\.)?wsj\.com/video/[^/]+/
+        )
+        (?P<id>[a-zA-Z0-9-]+)'''
     IE_DESC = 'Wall Street Journal'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
-        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
+        'md5': 'e230a5bb249075e40793b655a54a02e4',
         'info_dict': {
             'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
             'ext': 'mp4',
@@ -24,65 +30,60 @@ class WSJIE(InfoExtractor):
             'duration': 90,
             'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
         },
-    }
+    }, {
+        'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        bitrates = [128, 174, 264, 320, 464, 664, 1264]
         api_url = (
             'http://video-api.wsj.com/api-video/find_all_videos.asp?'
-            'type=guid&count=1&query=%s&'
-            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
-            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
-            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
-            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
-            'allthingsd-subsection,sm-section,sm-subsection,provider,'
-            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
-            'emailURL,emailPartnerID,showName,omnitureProgramName,'
-            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
-            'omniturePublishDate,%s') % (
-                video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
+            'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
+            'thumbnailList,author,description,name,duration,videoURL,'
+            'titletag,formattedCreationDate,keywords,editor' % video_id)
         info = self._download_json(api_url, video_id)['items'][0]
-
-        # Thumbnails are conveniently in the correct format already
-        thumbnails = info.get('thumbnailList')
-        creator = info.get('author')
-        uploader_id = info.get('editor')
-        categories = info.get('keywords')
-        duration = int_or_none(info.get('duration'))
-        upload_date = unified_strdate(
-            info.get('formattedCreationDate'), day_first=False)
         title = info.get('name', info.get('titletag'))
 
-        formats = [{
-            'format_id': 'f4m',
-            'format_note': 'f4m (meta URL)',
-            'url': info['videoURL'],
-        }]
-        if info.get('hls'):
+        formats = []
+
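+        # The API may expose an HDS manifest, an HLS manifest and a list of
+        # progressive MP4s; collect whichever are present, non-fatally.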
+        f4m_url = info.get('videoURL')
+        if f4m_url:
+            formats.extend(self._extract_f4m_formats(
+                f4m_url, video_id, f4m_id='hds', fatal=False))
+
+        m3u8_url = info.get('hls')
+        if m3u8_url:
             formats.extend(self._extract_m3u8_formats(
                 info['hls'], video_id, ext='mp4',
-                preference=0, entry_protocol='m3u8_native'))
-        for br in bitrates:
-            field = 'video%dkMP4Url' % br
-            if info.get(field):
-                formats.append({
-                    'format_id': 'mp4-%d' % br,
-                    'container': 'mp4',
-                    'tbr': br,
-                    'url': info[field],
-                })
+                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+        for v in info.get('videoMP4List', []):
+            mp4_url = v.get('url')
+            if not mp4_url:
+                continue
+            tbr = int_or_none(v.get('bitrate'))
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+                'tbr': tbr,
+                'width': int_or_none(v.get('width')),
+                'height': int_or_none(v.get('height')),
+                'fps': float_or_none(v.get('fps')),
+            })
         self._sort_formats(formats)
 
         return {
             'id': video_id,
             'formats': formats,
-            'thumbnails': thumbnails,
-            'creator': creator,
-            'uploader_id': uploader_id,
-            'duration': duration,
-            'upload_date': upload_date,
+            # Thumbnails are conveniently in the correct format already
+            'thumbnails': info.get('thumbnailList'),
+            'creator': info.get('author'),
+            'uploader_id': info.get('editor'),
+            'duration': int_or_none(info.get('duration')),
+            'upload_date': unified_strdate(info.get(
+                'formattedCreationDate'), day_first=False),
             'title': title,
-            'categories': categories,
+            'categories': info.get('keywords'),
         }
index 2d1504eaacd36eb564da06eb541d2b6b8eabaa44..995aada0d1565ccc76f2fa2c7654e3f53f834d91 100644 (file)
@@ -5,29 +5,44 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    decode_packed_codes,
     ExtractorError,
     int_or_none,
+    NO_DEFAULT,
     sanitized_Request,
     urlencode_postdata,
 )
 
 
 class XFileShareIE(InfoExtractor):
-    IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
-    _VALID_URL = r'''(?x)
-        https?://(?P<host>(?:www\.)?
-            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/
-        (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
-    '''
-
-    _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
+    _SITES = (
+        ('daclips.in', 'DaClips'),
+        ('filehoot.com', 'FileHoot'),
+        ('gorillavid.in', 'GorillaVid'),
+        ('movpod.in', 'MovPod'),
+        ('powerwatch.pw', 'PowerWatch'),
+        ('rapidvideo.ws', 'Rapidvideo.ws'),
+        ('thevideobee.to', 'TheVideoBee'),
+        ('vidto.me', 'Vidto'),
+        ('streamin.to', 'Streamin.To'),
+        ('xvidstage.com', 'XVIDSTAGE'),
+    )
+
+    IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
+    _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+                  % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
+
+    _FILE_NOT_FOUND_REGEXES = (
+        r'>(?:404 - )?File Not Found<',
+        r'>The file was removed by administrator<',
+    )
 
     _TESTS = [{
         'url': 'http://gorillavid.in/06y9juieqpmi',
         'md5': '5ae4a3580620380619678ee4875893ba',
         'info_dict': {
             'id': '06y9juieqpmi',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',
             'thumbnail': 're:http://.*\.jpg',
         },
@@ -43,25 +58,6 @@ class XFileShareIE(InfoExtractor):
             'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc',
             'thumbnail': 're:http://.*\.jpg',
         }
-    }, {
-        # video with countdown timeout
-        'url': 'http://fastvideo.in/1qmdn1lmsmbw',
-        'md5': '8b87ec3f6564a3108a0e8e66594842ba',
-        'info_dict': {
-            'id': '1qmdn1lmsmbw',
-            'ext': 'mp4',
-            'title': 'Man of Steel - Trailer',
-            'thumbnail': 're:http://.*\.jpg',
-        },
-    }, {
-        'url': 'http://realvid.net/ctn2y6p2eviw',
-        'md5': 'b2166d2cf192efd6b6d764c18fd3710e',
-        'info_dict': {
-            'id': 'ctn2y6p2eviw',
-            'ext': 'flv',
-            'title': 'rdx 1955',
-            'thumbnail': 're:http://.*\.jpg',
-        },
     }, {
         'url': 'http://movpod.in/0wguyyxi1yca',
         'only_matching': True,
@@ -72,7 +68,8 @@ class XFileShareIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4',
             'thumbnail': 're:http://.*\.jpg',
-        }
+        },
+        'skip': 'Video removed',
     }, {
         'url': 'http://vidto.me/ku5glz52nqe1.html',
         'info_dict': {
@@ -87,6 +84,17 @@ class XFileShareIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Big Buck Bunny trailer',
         },
+    }, {
+        'url': 'http://xvidstage.com/e0qcnl03co6z',
+        'info_dict': {
+            'id': 'e0qcnl03co6z',
+            'ext': 'mp4',
+            'title': 'Chucky Prank 2015.mp4',
+        },
+    }, {
+        # removed by administrator
+        'url': 'http://xvidstage.com/amfy7atlkx25',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -96,7 +104,7 @@ class XFileShareIE(InfoExtractor):
         url = 'http://%s/%s' % (mobj.group('host'), video_id)
         webpage = self._download_webpage(url, video_id)
 
-        if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
+        if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
         fields = self._hidden_inputs(webpage)
@@ -122,10 +130,23 @@ class XFileShareIE(InfoExtractor):
              r'>Watch (.+) ',
              r'<h2 class="video-page-head">([^<]+)</h2>'],
             webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
-        video_url = self._search_regex(
-            [r'file\s*:\s*["\'](http[^"\']+)["\'],',
-             r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-z.\/\-_]+)'],
-            webpage, 'file url')
+
+        def extract_video_url(default=NO_DEFAULT):
+            return self._search_regex(
+                (r'file\s*:\s*(["\'])(?P<url>http.+?)\1,',
+                 r'file_link\s*=\s*(["\'])(?P<url>http.+?)\1',
+                 r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http.+?)\2\)',
+                 r'<embed[^>]+src=(["\'])(?P<url>http.+?)\1'),
+                webpage, 'file url', default=default, group='url')
+
+        video_url = extract_video_url(default=None)
+
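+        # Some hosts hide the file URL inside JavaScript obfuscated with the
+        # common p.a.c.k.e.r.-style packer; unpack it and retry.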
+        if not video_url:
+            webpage = decode_packed_codes(self._search_regex(
+                r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))",
+                webpage, 'packed code'))
+            video_url = extract_video_url()
+
         thumbnail = self._search_regex(
             r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
 
index b3547174dd92beffafaf8f220b50b94a25f2fa2b..bd8e1af2e0f6c25fc44aea36c23b813b092b4438 100644 (file)
@@ -12,37 +12,52 @@ from ..utils import (
 
 
 class XHamsterIE(InfoExtractor):
-    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
-    _TESTS = [
-        {
-            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
-            'info_dict': {
-                'id': '1509445',
-                'ext': 'mp4',
-                'title': 'FemaleAgent Shy beauty takes the bait',
-                'upload_date': '20121014',
-                'uploader': 'Ruseful2011',
-                'duration': 893.52,
-                'age_limit': 18,
-            }
+    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
+    _TESTS = [{
+        'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+        'md5': '8281348b8d3c53d39fffb377d24eac4e',
+        'info_dict': {
+            'id': '1509445',
+            'ext': 'mp4',
+            'title': 'FemaleAgent Shy beauty takes the bait',
+            'upload_date': '20121014',
+            'uploader': 'Ruseful2011',
+            'duration': 893.52,
+            'age_limit': 18,
         },
-        {
-            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
-            'info_dict': {
-                'id': '2221348',
-                'ext': 'mp4',
-                'title': 'Britney Spears  Sexy Booty',
-                'upload_date': '20130914',
-                'uploader': 'jojo747400',
-                'duration': 200.48,
-                'age_limit': 18,
-            }
+    }, {
+        'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+        'info_dict': {
+            'id': '2221348',
+            'ext': 'mp4',
+            'title': 'Britney Spears  Sexy Booty',
+            'upload_date': '20130914',
+            'uploader': 'jojo747400',
+            'duration': 200.48,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # empty seo
+        'url': 'http://xhamster.com/movies/5667973/.html',
+        'info_dict': {
+            'id': '5667973',
+            'ext': 'mp4',
+            'title': '....',
+            'upload_date': '20160208',
+            'uploader': 'parejafree',
+            'duration': 72.0,
+            'age_limit': 18,
         },
-        {
-            'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
-            'only_matching': True,
+        'params': {
+            'skip_download': True,
         },
-    ]
+    }, {
+        'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         def extract_video_url(webpage, name):
@@ -170,7 +185,7 @@ class XHamsterEmbedIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         video_url = self._search_regex(
-            r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+            r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id,
             webpage, 'xhamster url', default=None)
 
         if not video_url:
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
new file mode 100644 (file)
index 0000000..a6dfc4a
--- /dev/null
@@ -0,0 +1,168 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import int_or_none
+
+
+class XiamiBaseIE(InfoExtractor):
+    _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
+
+    def _download_webpage(self, *args, **kwargs):
+        webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs)
+        if '>Xiami is currently not available in your country.<' in webpage:
+            self.raise_geo_restricted('Xiami is currently not available in your country')
+        return webpage
+
+    def _extract_track(self, track, track_id=None):
+        title = track['title']
+        track_url = self._decrypt(track['location'])
+
+        subtitles = {}
+        lyrics_url = track.get('lyric_url') or track.get('lyric')
+        if lyrics_url and lyrics_url.startswith('http'):
+            subtitles['origin'] = [{'url': lyrics_url}]
+
+        return {
+            'id': track.get('song_id') or track_id,
+            'url': track_url,
+            'title': title,
+            'thumbnail': track.get('pic') or track.get('album_pic'),
+            'duration': int_or_none(track.get('length')),
+            'creator': track.get('artist', '').split(';')[0],
+            'track': title,
+            'album': track.get('album_name'),
+            'artist': track.get('artist'),
+            'subtitles': subtitles,
+        }
+
+    def _extract_tracks(self, item_id, typ=None):
+        playlist = self._download_json(
+            '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id)
+        return [
+            self._extract_track(track, item_id)
+            for track in playlist['data']['trackList']]
+
+    @staticmethod
+    def _decrypt(origin):
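+        # The location field is a simple columnar transposition: the first
+        # character gives the number of columns, the remainder is the payload
+        # stored column by column. Read it back row by row, URL-unquote it
+        # and map '^' back to '0'.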
+        n = int(origin[0])
+        origin = origin[1:]
+        short_length = len(origin) // n
+        long_num = len(origin) - short_length * n
+        l = tuple()
+        for i in range(0, n):
+            length = short_length
+            if i < long_num:
+                length += 1
+            l += (origin[0:length], )
+            origin = origin[length:]
+        ans = ''
+        for i in range(0, short_length + 1):
+            for j in range(0, n):
+                if len(l[j]) > i:
+                    ans += l[j][i]
+        return compat_urllib_parse_unquote(ans).replace('^', '0')
+
+
+class XiamiSongIE(XiamiBaseIE):
+    IE_NAME = 'xiami:song'
+    IE_DESC = '虾米音乐'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.xiami.com/song/1775610518',
+        'md5': '521dd6bea40fd5c9c69f913c232cb57e',
+        'info_dict': {
+            'id': '1775610518',
+            'ext': 'mp3',
+            'title': 'Woman',
+            'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+            'duration': 265,
+            'creator': 'HONNE',
+            'track': 'Woman',
+            'album': 'Woman',
+            'artist': 'HONNE',
+            'subtitles': {
+                'origin': [{
+                    'ext': 'lrc',
+                }],
+            },
+        },
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/song/1775256504',
+        'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
+        'info_dict': {
+            'id': '1775256504',
+            'ext': 'mp3',
+            'title': '悟空',
+            'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+            'duration': 200,
+            'creator': '戴荃',
+            'track': '悟空',
+            'album': '悟空',
+            'artist': '戴荃',
+            'subtitles': {
+                'origin': [{
+                    'ext': 'lrc',
+                }],
+            },
+        },
+        'skip': 'Georestricted',
+    }]
+
+    def _real_extract(self, url):
+        return self._extract_tracks(self._match_id(url))[0]
+
+
+class XiamiPlaylistBaseIE(XiamiBaseIE):
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id)
+
+
+class XiamiAlbumIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:album'
+    IE_DESC = '虾米音乐 - 专辑'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)'
+    _TYPE = '1'
+    _TESTS = [{
+        'url': 'http://www.xiami.com/album/2100300444',
+        'info_dict': {
+            'id': '2100300444',
+        },
+        'playlist_count': 10,
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
+        'only_matching': True,
+    }]
+
+
+class XiamiArtistIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:artist'
+    IE_DESC = '虾米音乐 - 歌手'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)'
+    _TYPE = '2'
+    _TEST = {
+        'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
+        'info_dict': {
+            'id': '2132',
+        },
+        'playlist_count': 20,
+        'skip': 'Georestricted',
+    }
+
+
+class XiamiCollectionIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:collection'
+    IE_DESC = '虾米音乐 - 精选集'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)'
+    _TYPE = '3'
+    _TEST = {
+        'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
+        'info_dict': {
+            'id': '156527391',
+        },
+        'playlist_mincount': 29,
+        'skip': 'Georestricted',
+    }
index 7c9d8af6f2585207347d58d08fc607ebf4d28900..36e5ead1e690db9bb0c1c1a64650a69c784bbe76 100644 (file)
@@ -2,15 +2,15 @@
 from __future__ import unicode_literals
 
 import re
+import time
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_chr,
     compat_ord,
 )
 from ..utils import (
     int_or_none,
-    parse_filesize,
+    parse_duration,
 )
 
 
@@ -22,7 +22,7 @@ class XMinusIE(InfoExtractor):
         'info_dict': {
             'id': '4542',
             'ext': 'mp3',
-            'title': 'Леонид Агутин-Песенка шофера',
+            'title': 'Леонид Агутин-Песенка шофёра',
             'duration': 156,
             'tbr': 320,
             'filesize_approx': 5900000,
@@ -36,38 +36,41 @@ class XMinusIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         artist = self._html_search_regex(
-            r'minus_track\.artist="(.+?)"', webpage, 'artist')
+            r'<a[^>]+href="/artist/\d+">([^<]+)</a>', webpage, 'artist')
         title = artist + '-' + self._html_search_regex(
-            r'minus_track\.title="(.+?)"', webpage, 'title')
-        duration = int_or_none(self._html_search_regex(
-            r'minus_track\.dur_sec=\'([0-9]*?)\'',
+            r'<span[^>]+class="minustrack-full-title(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'title')
+        duration = parse_duration(self._html_search_regex(
+            r'<span[^>]+class="player-duration(?:\s+[^"]+)?"[^>]*>([^<]+)',
             webpage, 'duration', fatal=False))
-        filesize_approx = parse_filesize(self._html_search_regex(
-            r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
-            webpage, 'approximate filesize', fatal=False))
-        tbr = int_or_none(self._html_search_regex(
-            r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
-            webpage, 'bitrate', fatal=False))
+        mobj = re.search(
+            r'<div[^>]+class="dw-info(?:\s+[^"]+)?"[^>]*>(?P<tbr>\d+)\s*кбит/c\s+(?P<filesize>[0-9.]+)\s*мб</div>',
+            webpage)
+        tbr = filesize_approx = None
+        if mobj:
+            filesize_approx = float(mobj.group('filesize')) * 1000000
+            tbr = float(mobj.group('tbr'))
         view_count = int_or_none(self._html_search_regex(
-            r'<div class="quality.*?► ([0-9]+)',
+            r'<span><[^>]+class="icon-chart-bar".*?>(\d+)</span>',
             webpage, 'view count', fatal=False))
         description = self._html_search_regex(
-            r'(?s)<div id="song_texts">(.*?)</div><br',
+            r'(?s)<pre[^>]+id="lyrics-original"[^>]*>(.*?)</pre>',
             webpage, 'song lyrics', fatal=False)
         if description:
             description = re.sub(' *\r *', '\n', description)
 
-        enc_token = self._html_search_regex(
-            r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
-        token = ''.join(
-            c if pos == 3 else compat_chr(compat_ord(c) - 1)
-            for pos, c in enumerate(reversed(enc_token)))
-        video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token)
+        k = self._search_regex(
+            r'<div[^>]+id="player-bottom"[^>]+data-k="([^"]+)">', webpage,
+            'encoded data')
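+        # Derive the download token the way the site's player script
+        # apparently does: character-code sum of the data-k attribute plus
+        # the track id plus the current hour.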
+        h = time.time() / 3600
+        a = sum(compat_ord(c) for c in k) + int(video_id) + h
+        video_url = 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h)
 
         return {
             'id': video_id,
             'title': title,
             'url': video_url,
+            # The actual extension is not known until the file is downloaded
+            'ext': 'mp3',
             'duration': duration,
             'filesize_approx': filesize_approx,
             'tbr': tbr,
index 5a41f8ffa0c5a46a3d0431a6aac8e93ba8ca1cb9..bcb140305559a164f56392dd33eab4d5c7b0bab5 100644 (file)
@@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote
 
 
 class XNXXIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
-    _TEST = {
-        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
-        'md5': '0831677e2b4761795f68d417e0b7b445',
+    _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+    _TESTS = [{
+        'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
+        'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',
         'info_dict': {
-            'id': '1135332',
+            'id': '55awb78',
             'ext': 'flv',
-            'title': 'lida » Naked Funny Actress  (5)',
+            'title': 'Skyrim Test Video',
             'age_limit': 18,
-        }
-    }
+        },
+    }, {
+        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.xnxx.com/video-55awb78/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 4075b8a4f8a705cf29aa1430656146350a8d07aa..83bc1fef2095b322a67199c60e27fcc6f8f1bcbc 100644 (file)
@@ -4,17 +4,23 @@ import itertools
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
     orderedSet,
+    parse_duration,
     sanitized_Request,
     str_to_int,
 )
 
 
 class XTubeIE(InfoExtractor):
-    _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            xtube:|
+                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-)
+                        )
+                        (?P<id>[^/?&#]+)
+                    '''
 
     _TESTS = [{
         # old URL schema
@@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):
             'description': 'contains:an ET kind of thing',
             'uploader': 'greenshowers',
             'duration': 450,
+            'view_count': int,
+            'comment_count': int,
             'age_limit': 18,
         }
     }, {
@@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
         webpage = self._download_webpage(req, display_id)
 
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
-            video_id)['flashvars']
-
-        title = flashvars.get('title') or self._search_regex(
-            r'<h1>([^<]+)</h1>', webpage, 'title')
-        video_url = compat_urllib_parse_unquote(flashvars['video_url'])
-        duration = int_or_none(flashvars.get('video_duration'))
-
-        uploader = self._search_regex(
-            r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
-            webpage, 'uploader', fatal=False)
+        sources = self._parse_json(self._search_regex(
+            r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id)
+
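+        # sources maps resolution labels to stream URLs, so the key doubles
+        # as both format id and height.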
+        formats = []
+        for format_id, format_url in sources.items():
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'height': int_or_none(format_id),
+            })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+            webpage, 'title', group='title')
         description = self._search_regex(
             r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+        uploader = self._search_regex(
+            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
+             r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
+            webpage, 'uploader', fatal=False)
+        duration = parse_duration(self._search_regex(
+            r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>',
+            webpage, 'duration', fatal=False))
         view_count = str_to_int(self._search_regex(
             r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
             webpage, 'view count', fatal=False))
@@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):
         return {
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
             'title': title,
             'description': description,
             'uploader': uploader,
@@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):
             'view_count': view_count,
             'comment_count': comment_count,
             'age_limit': 18,
+            'formats': formats,
         }
 
 
index 2466410faaba4e0047fe26099ee936b68dcb9e34..a66daee46ebc0324152f69eefa2b66d79bfb513d 100644 (file)
@@ -66,6 +66,21 @@ class XuiteIE(InfoExtractor):
             'uploader_id': '242127761',
             'categories': ['電玩動漫'],
         },
+        'skip': 'Video removed',
+    }, {
+        # Video with encoded media id
+        # from http://forgetfulbc.blogspot.com/2016/06/date.html
+        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
+        'info_dict': {
+            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+            'ext': 'mp4',
+            'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
+            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+            'timestamp': 1466160960,
+            'upload_date': '20160617',
+            'uploader': 'B.C. & Lowy',
+            'uploader_id': '232279340',
+        },
     }, {
         'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
         'only_matching': True,
@@ -79,10 +94,9 @@ class XuiteIE(InfoExtractor):
     def base64_encode_utf8(data):
         return base64.b64encode(data.encode('utf-8')).decode('utf-8')
 
-    def _extract_flv_config(self, media_id):
-        base64_media_id = self.base64_encode_utf8(media_id)
+    def _extract_flv_config(self, encoded_media_id):
         flv_config = self._download_xml(
-            'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
+            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
             'flv config')
         prop_dict = {}
         for prop in flv_config.findall('./property'):
@@ -107,9 +121,14 @@ class XuiteIE(InfoExtractor):
                 '%s returned error: %s' % (self.IE_NAME, error_msg),
                 expected=True)
 
-        video_id = self._html_search_regex(
-            r'data-mediaid="(\d+)"', webpage, 'media id')
-        flv_config = self._extract_flv_config(video_id)
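+        # Newer pages embed an already base64-encoded media id in the player
+        # attributes; fall back to encoding the numeric id ourselves.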
+        encoded_media_id = self._search_regex(
+            r'attributes\.name\s*=\s*"([^"]+)"', webpage,
+            'encoded media id', default=None)
+        if encoded_media_id is None:
+            video_id = self._html_search_regex(
+                r'data-mediaid="(\d+)"', webpage, 'media id')
+            encoded_media_id = self.base64_encode_utf8(video_id)
+        flv_config = self._extract_flv_config(encoded_media_id)
 
         FORMATS = {
             'audio': 'mp3',
index 710ad5041988b0e1c932b135af91a27036dfd664..1dfe031cab8e9ccf4fa6fb23fbd90d40759ac930 100644 (file)
@@ -8,7 +8,6 @@ from ..utils import (
     clean_html,
     ExtractorError,
     determine_ext,
-    sanitized_Request,
 )
 
 
@@ -25,8 +24,6 @@ class XVideosIE(InfoExtractor):
         }
     }
 
-    _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -35,31 +32,34 @@ class XVideosIE(InfoExtractor):
         if mobj:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
 
-        video_url = compat_urllib_parse_unquote(
-            self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
         video_title = self._html_search_regex(
             r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
         video_thumbnail = self._search_regex(
             r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
 
-        formats = [{
-            'url': video_url,
-        }]
+        formats = []
 
-        android_req = sanitized_Request(url)
-        android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
-        android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+        video_url = compat_urllib_parse_unquote(self._search_regex(
+            r'flv_url=(.+?)&', webpage, 'video URL', default=''))
+        if video_url:
+            formats.append({'url': video_url})
 
-        if android_webpage is not None:
-            player_params_str = self._search_regex(
-                'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
-                android_webpage, 'player parameters', default='')
-            player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
-            if player_params:
-                formats.extend([{
-                    'url': param,
-                    'preference': -10,
-                } for param in player_params if determine_ext(param) == 'mp4'])
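+        # The HTML5 player is instantiated with its stream URLs as plain
+        # constructor arguments; scan them for direct MP4 and HLS links.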
+        player_args = self._search_regex(
+            r'(?s)new\s+HTML5Player\((.+?)\)', webpage, 'html5 player', default=None)
+        if player_args:
+            for arg in player_args.split(','):
+                format_url = self._search_regex(
+                    r'(["\'])(?P<url>https?://.+?)\1', arg, 'url',
+                    default=None, group='url')
+                if not format_url:
+                    continue
+                ext = determine_ext(format_url)
+                if ext == 'mp4':
+                    formats.append({'url': format_url})
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
 
         self._sort_formats(formats)
 
@@ -67,7 +67,6 @@ class XVideosIE(InfoExtractor):
             'id': video_id,
             'formats': formats,
             'title': video_title,
-            'ext': 'flv',
             'thumbnail': video_thumbnail,
             'age_limit': 18,
         }
index b2d8f4b48daddcf734d3a1fb461d1b92736bcfd1..b0679dfb70868f39b1190b8b04696787c02d75b1 100644 (file)
@@ -19,12 +19,13 @@ from ..utils import (
     mimetype2ext,
 )
 
+from .brightcove import BrightcoveNewIE
 from .nbc import NBCSportsVPlayerIE
 
 
 class YahooIE(InfoExtractor):
     IE_DESC = 'Yahoo screen and movies'
-    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+    _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?)'
     _TESTS = [
         {
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -38,7 +39,7 @@ class YahooIE(InfoExtractor):
         },
         {
             'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
-            'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
+            'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23',
             'info_dict': {
                 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
                 'ext': 'mp4',
@@ -49,7 +50,7 @@ class YahooIE(InfoExtractor):
         },
         {
             'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
-            'md5': '60e8ac193d8fb71997caa8fce54c6460',
+            'md5': '75ffabdb87c16d4ffe8c036dc4d1c136',
             'info_dict': {
                 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
                 'ext': 'mp4',
@@ -59,15 +60,15 @@ class YahooIE(InfoExtractor):
             }
         },
         {
-            'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html',
-            'md5': '3a09cf59349cfaddae1797acc3c087fc',
+            'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html',
+            'md5': '9035d38f88b1782682a3e89f985be5bb',
             'info_dict': {
                 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
                 'ext': 'mp4',
                 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
                 'description': '直言台南沒捷運 交通居五都之末',
                 'duration': 396,
-            }
+            },
         },
         {
             'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
@@ -89,17 +90,32 @@ class YahooIE(InfoExtractor):
                 'title': 'Program that makes hockey more affordable not offered in Manitoba',
                 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
                 'duration': 121,
-            }
+            },
+            'skip': 'Video gone',
         }, {
             'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
-            'md5': '226a895aae7e21b0129e2a2006fe9690',
             'info_dict': {
-                'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
-                'ext': 'mp4',
-                'title': '\'The Interview\' TV Spot: War',
-                'description': 'The Interview',
-                'duration': 30,
-            }
+                'id': '154609075',
+            },
+            'playlist': [{
+                'md5': 'f8e336c6b66f503282e5f719641d6565',
+                'info_dict': {
+                    'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
+                    'ext': 'mp4',
+                    'title': '\'The Interview\' TV Spot: War',
+                    'description': 'The Interview',
+                    'duration': 30,
+                },
+            }, {
+                'md5': '958bcb90b4d6df71c56312137ee1cd5a',
+                'info_dict': {
+                    'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9',
+                    'ext': 'mp4',
+                    'title': '\'The Interview\' TV Spot: Guys',
+                    'description': 'The Interview',
+                    'duration': 30,
+                },
+            }],
         }, {
             'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
             'md5': '88e209b417f173d86186bef6e4d1f160',
@@ -119,10 +135,11 @@ class YahooIE(InfoExtractor):
                 'title': 'Connect the Dots: Dark Side of Virgo',
                 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
                 'duration': 201,
-            }
+            },
+            'skip': 'Domain name in.lifestyle.yahoo.com gone',
         }, {
             'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
-            'md5': '989396ae73d20c6f057746fb226aa215',
+            'md5': 'b17ac378b1134fa44370fb27db09a744',
             'info_dict': {
                 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
                 'ext': 'mp4',
@@ -141,6 +158,9 @@ class YahooIE(InfoExtractor):
                 'ext': 'flv',
                 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
                 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+                'upload_date': '20150313',
+                'uploader': 'NBCU-SPORTS',
+                'timestamp': 1426270238,
             }
         }, {
             'url': 'https://tw.news.yahoo.com/-100120367.html',
@@ -148,7 +168,7 @@ class YahooIE(InfoExtractor):
         }, {
             # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
             'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
-            'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
+            'md5': '1ddbf7c850777548438e5c4f147c7b8c',
             'info_dict': {
                 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
                 'ext': 'mp4',
@@ -166,6 +186,17 @@ class YahooIE(InfoExtractor):
                 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.',
             },
         },
+        {
+            # config['models']['applet_model']['data']['sapi'] has no query
+            'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016',
+            'md5': 'dac0c72d502bc5facda80c9e6d5c98db',
+            'info_dict': {
+                'id': 'a6015640-e9e5-3efb-bb60-05589a183919',
+                'ext': 'mp4',
+                'description': 'Galactic',
+                'title': 'Dolla Diva (feat. Maggie Koerner)',
+            },
+        },
     ]
 
     def _real_extract(self, url):
@@ -174,23 +205,35 @@ class YahooIE(InfoExtractor):
         page_id = mobj.group('id')
         url = mobj.group('url')
         host = mobj.group('host')
-        webpage = self._download_webpage(url, display_id)
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+        if 'err=404' in urlh.geturl():
+            raise ExtractorError('Video gone', expected=True)
 
         # Look for iframed media first
-        iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
-        if iframe_m:
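+        # A page may embed several iframed videos; collect them all and
+        # return them as a playlist.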
+        entries = []
+        iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
+        for idx, iframe_url in enumerate(iframe_urls):
             iframepage = self._download_webpage(
-                host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
+                host + iframe_url, display_id,
+                note='Downloading iframe webpage for video #%d' % idx)
             items_json = self._search_regex(
                 r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
             if items_json:
                 items = json.loads(items_json)
                 video_id = items[0]['id']
-                return self._get_info(video_id, display_id, webpage)
+                entries.append(self._get_info(video_id, display_id, webpage))
+        if entries:
+            return self.playlist_result(entries, page_id)
+
         # Look for NBCSports iframes
         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
         if nbc_sports_url:
-            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+            return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
+
+        # Look for Brightcove New Studio embeds
+        bc_url = BrightcoveNewIE._extract_url(webpage)
+        if bc_url:
+            return self.url_result(bc_url, BrightcoveNewIE.ie_key())
 
         # Query result is often embedded in webpage as JSON. Sometimes explicit requests
         # to video API results in a failure with geo restriction reason therefore using
@@ -202,7 +245,7 @@ class YahooIE(InfoExtractor):
             config = self._parse_json(config_json, display_id, fatal=False)
             if config:
                 sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
-                if sapi:
+                if sapi and 'query' in sapi:
                     return self._extract_info(display_id, sapi, webpage)
 
         items_json = self._search_regex(
@@ -306,7 +349,7 @@ class YahooIE(InfoExtractor):
             webpage, 'region', fatal=False, default='US')
         data = compat_urllib_parse_urlencode({
             'protocol': 'http',
-            'region': region,
+            'region': region.upper(),
         })
         query_url = (
             'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
index 7a90cc60cfa61a880ba8a0d05cd841017a24bfa8..b37d0eab66b53ab45ff38dcac91598079f08a275 100644 (file)
@@ -10,17 +10,35 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
-    sanitized_Request,
-    urlencode_postdata,
 )
 
 
 class YandexMusicBaseIE(InfoExtractor):
     @staticmethod
     def _handle_error(response):
-        error = response.get('error')
-        if error:
-            raise ExtractorError(error, expected=True)
+        if isinstance(response, dict):
+            error = response.get('error')
+            if error:
+                raise ExtractorError(error, expected=True)
+            if response.get('type') == 'captcha' or 'captcha' in response:
+                YandexMusicBaseIE._raise_captcha()
+
+    @staticmethod
+    def _raise_captcha():
+        raise ExtractorError(
+            'YandexMusic considers youtube-dl requests automated and '
+            'asks you to solve a CAPTCHA. You can either wait until you '
+            'are unblocked (optionally using --sleep-interval for future '
+            'requests), or go to https://music.yandex.ru/, solve the '
+            'CAPTCHA, then export your cookies and pass the cookie file '
+            'to youtube-dl with --cookies',
+            expected=True)
+
+    def _download_webpage(self, *args, **kwargs):
+        webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs)
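+        # Yandex signals rate limiting with a regular page whose text reads
+        # (in Russian): "We are very sorry, but requests received from your
+        # IP address look automated."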
+        if 'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;вашего IP-адреса, похожи на&nbsp;автоматические.' in webpage:
+            self._raise_captcha()
+        return webpage
 
     def _download_json(self, *args, **kwargs):
         response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
@@ -47,7 +65,8 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
             'album_artist': 'Carlo Ambrosio',
             'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio',
             'release_year': '2009',
-        }
+        },
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }
 
     def _get_track_url(self, storage_dir, track_id):
@@ -139,6 +158,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
             'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
         },
         'playlist_count': 50,
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }
 
     def _real_extract(self, url):
@@ -161,7 +181,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
 class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:playlist'
     IE_DESC = 'Яндекс.Музыка - Плейлист'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
@@ -171,6 +191,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
             'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
         },
         'playlist_count': 6,
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }, {
         # playlist exceeding the limit of 150 tracks shipped with webpage (see
         # https://github.com/rg3/youtube-dl/issues/6666)
@@ -179,46 +200,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
             'id': '1036',
             'title': 'Музыка 90-х',
         },
-        'playlist_count': 310,
+        'playlist_mincount': 300,
+        'skip': 'Travis CI servers blocked by YandexMusic',
     }]
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        mu = self._parse_json(
-            self._search_regex(
-                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
-            playlist_id)
-
-        playlist = mu['pageData']['playlist']
-        tracks, track_ids = playlist['tracks'], playlist['trackIds']
-
-        # tracks dictionary shipped with webpage is limited to 150 tracks,
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        user = mobj.group('user')
+        playlist_id = mobj.group('id')
+
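+        # Query the JSON handler the web player itself appears to use
+        # instead of scraping the Mu object inlined in the webpage.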
+        playlist = self._download_json(
+            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+            playlist_id, 'Downloading playlist JSON',
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query={
+                'owner': user,
+                'kinds': playlist_id,
+                'light': 'true',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })['playlist']
+
+        tracks, track_ids = playlist['tracks'], list(map(compat_str, playlist['trackIds']))
+
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
         # missing tracks should be retrieved manually.
         if len(tracks) < len(track_ids):
-            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
-            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
-            request = sanitized_Request(
-                'https://music.yandex.ru/handlers/track-entries.jsx',
-                urlencode_postdata({
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
+            missing_tracks = self._download_json(
+                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+                playlist_id, 'Downloading missing tracks JSON',
+                fatal=False,
+                headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                },
+                query={
                     'entries': ','.join(missing_track_ids),
-                    'lang': mu.get('settings', {}).get('lang', 'en'),
-                    'external-domain': 'music.yandex.ru',
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
                     'overembed': 'false',
-                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),
                     'strict': 'true',
-                }))
-            request.add_header('Referer', url)
-            request.add_header('X-Requested-With', 'XMLHttpRequest')
-
-            missing_tracks = self._download_json(
-                request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+                })
             if missing_tracks:
                 tracks.extend(missing_tracks)
 
         return self.playlist_result(
             self._build_playlist(tracks),
             compat_str(playlist_id),
-            playlist['title'], playlist.get('description'))
+            playlist.get('title'), playlist.get('description'))
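
The rewritten extractor fetches playlist metadata through the playlist.jsx handler, which ships at most 150 tracks, then batches any remainder through track-entries.jsx. A minimal sketch of that diff-and-batch step, where fetch_entries is a hypothetical stand-in for the batched track-entries.jsx request:

    # Sketch only: fetch_entries(ids_csv) stands in for the track-entries.jsx call.
    def fill_missing_tracks(tracks, track_ids, fetch_entries):
        present_ids = set(str(t['id']) for t in tracks if t.get('id'))
        missing_ids = [tid for tid in track_ids if tid not in present_ids]
        if missing_ids:
            extra = fetch_entries(','.join(missing_ids))  # one request for all of them
            if extra:
                tracks.extend(extra)
        return tracks
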
index 349ce09414b765060ac5f06121b87b529e287a12..e37f237c76c6880eb1d442e8302dcef558d0e9d8 100644 (file)
@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 import base64
+import itertools
 import random
+import re
 import string
 import time
 
@@ -13,7 +15,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
-    sanitized_Request,
+    get_element_by_attribute,
 )
 
 
@@ -215,14 +217,10 @@ class YoukuIE(InfoExtractor):
             headers = {
                 'Referer': req_url,
             }
+            headers.update(self.geo_verification_headers())
             self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
-            req = sanitized_Request(req_url, headers=headers)
 
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-            raw_data = self._download_json(req, video_id, note=note)
+            raw_data = self._download_json(req_url, video_id, note=note, headers=headers)
 
             return raw_data['data']
 
@@ -275,6 +273,8 @@ class YoukuIE(InfoExtractor):
                     'format_id': self.get_format_name(fm),
                     'ext': self.parse_ext_l(fm),
                     'filesize': int(seg['size']),
+                    'width': stream.get('width'),
+                    'height': stream.get('height'),
                 })
 
         return {
@@ -283,3 +283,52 @@ class YoukuIE(InfoExtractor):
             'title': title,
             'entries': entries,
         }
+
+
+class YoukuShowIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
+    IE_NAME = 'youku:show'
+
+    _TEST = {
+        'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
+        'info_dict': {
+            'id': 'zc7c670be07ff11e48b3f',
+            'title': '花千骨 未删减版',
+            'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
+        },
+        'playlist_count': 50,
+    }
+
+    _PAGE_SIZE = 40
+
+    def _find_videos_in_page(self, webpage):
+        videos = re.findall(
+            r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
+        return [
+            self.url_result(video_url, YoukuIE.ie_key(), title)
+            for video_url, title in videos]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+
+        entries = self._find_videos_in_page(webpage)
+
+        playlist_title = self._html_search_regex(
+            r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
+        detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
+        playlist_description = self._html_search_regex(
+            r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
+            detail_div, 'playlist description', fatal=False)
+
+        for idx in itertools.count(1):
+            episodes_page = self._download_webpage(
+                'http://www.youku.com/show_episode/id_%s.html' % show_id,
+                show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
+                note='Downloading episodes page %d' % idx)
+            new_entries = self._find_videos_in_page(episodes_page)
+            entries.extend(new_entries)
+            if len(new_entries) < self._PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, show_id, playlist_title, playlist_description)
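
YoukuShowIE pages through the show_episode fragments 40 entries at a time and stops as soon as a page comes back short. The same stop condition in isolation, with fetch_page as a hypothetical callable returning the entries of one page:

    import itertools

    PAGE_SIZE = 40

    def collect_all_entries(fetch_page):
        entries = []
        for idx in itertools.count(1):
            page = fetch_page(idx)  # cf. divid='reload_%d' % (idx * PAGE_SIZE + 1)
            entries.extend(page)
            if len(page) < PAGE_SIZE:  # a short page means there is nothing left
                break
        return entries
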
index 1124fe6c280cb0e23bee3a41ea323165ec714dce..0df2d76ee198d5d6ae1914f078cc96accec2d17e 100644 (file)
@@ -17,7 +17,7 @@ class YouPornIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
-        'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
+        'md5': '3744d24c50438cf5b6f6d59feb5055c2',
         'info_dict': {
             'id': '505835',
             'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
@@ -121,21 +121,21 @@ class YouPornIE(InfoExtractor):
             webpage, 'thumbnail', fatal=False, group='thumbnail')
 
         uploader = self._html_search_regex(
-            r'(?s)<div[^>]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>',
+            r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
             webpage, 'uploader', fatal=False)
         upload_date = unified_strdate(self._html_search_regex(
-            r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>',
+            r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>',
             webpage, 'upload date', fatal=False))
 
         age_limit = self._rta_search(webpage)
 
         average_rating = int_or_none(self._search_regex(
-            r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
+            r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
             webpage, 'average rating', fatal=False))
 
         view_count = str_to_int(self._search_regex(
-            r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>',
-            webpage, 'view count', fatal=False))
+            r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<',
+            webpage, 'view count', fatal=False, group='count'))
         comment_count = str_to_int(self._search_regex(
             r'>All [Cc]omments? \(([\d,.]+)\)',
             webpage, 'comment count', fatal=False))
index 44f98d294909a75f44f9c01e3a2ce0e7c66d86b5..8aa7dfc413a7141cd56328ec3c0eaa56941b4171 100644 (file)
@@ -344,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
+        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
+        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
 
         # Dash webm
         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
@@ -499,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'youtube_include_dash_manifest': True,
                 'format': '141',
             },
+            'skip': 'format 141 not served anymore',
         },
         # DASH manifest with encrypted signature
         {
@@ -515,7 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
             'params': {
                 'youtube_include_dash_manifest': True,
-                'format': '141',
+                'format': '141/bestaudio[ext=m4a]',
             },
         },
         # JS player signature function name containing $
@@ -535,7 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
             'params': {
                 'youtube_include_dash_manifest': True,
-                'format': '141',
+                'format': '141/bestaudio[ext=m4a]',
             },
         },
         # Controversy video
@@ -616,7 +619,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
                 'license': 'Standard YouTube License',
                 'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
-                'uploader': 'Olympics',
+                'uploader': 'Olympic',
                 'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
             },
             'params': {
@@ -669,7 +672,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
                 'uploader': 'dorappi2000',
                 'license': 'Standard YouTube License',
-                'formats': 'mincount:33',
+                'formats': 'mincount:32',
             },
         },
         # DASH manifest with segment_list
@@ -689,7 +692,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'params': {
                 'youtube_include_dash_manifest': True,
                 'format': '135',  # bestvideo
-            }
+            },
+            'skip': 'This live event has ended.',
         },
         {
             # Multifeed videos (multiple cameras), URL is for Main Camera
@@ -760,6 +764,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
             },
             'playlist_count': 2,
+            'skip': 'Not multifeed anymore',
         },
         {
             'url': 'http://vid.plus/FlRa-iH7PGw',
@@ -812,6 +817,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'params': {
                 'skip_download': True,
             },
+            'skip': 'This video does not exist.',
         },
         {
             # Video licensed under Creative Commons
@@ -1326,10 +1332,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if video_description:
             video_description = re.sub(r'''(?x)
                 <a\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    (?:[a-zA-Z-]+="[^"]*"\s+)*?
                     (?:title|href)="([^"]+)"\s+
-                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
-                    class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
+                    (?:[a-zA-Z-]+="[^"]*"\s+)*?
+                    class="[^"]*"[^>]*>
                 [^<]+\.{3}\s*
                 </a>
             ''', r'\1', video_description)
@@ -1724,6 +1730,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         }
 
 
+class YoutubeSharedVideoIE(InfoExtractor):
+    _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?ci=(?P<id>[0-9A-Za-z_-]{11})'
+    IE_NAME = 'youtube:shared'
+
+    _TEST = {
+        'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
+        'info_dict': {
+            'id': 'uPDB5I9wfp8',
+            'ext': 'webm',
+            'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
+            'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
+            'upload_date': '20160219',
+            'uploader': 'Pocoyo - Português (BR)',
+            'uploader_id': 'PocoyoBrazil',
+        },
+        'add_ie': ['Youtube'],
+        'params': {
+            # There are already too many Youtube downloads
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        real_video_id = self._html_search_meta(
+            'videoId', webpage, 'YouTube video id', fatal=True)
+
+        return self.url_result(real_video_id, YoutubeIE.ie_key())
+
+
 class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com playlists'
     _VALID_URL = r"""(?x)(?:
@@ -1939,10 +1978,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
         return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                 else super(YoutubeChannelIE, cls).suitable(url))
 
+    def _build_template_url(self, url, channel_id):
+        return self._TEMPLATE_URL % channel_id
+
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
-        url = self._TEMPLATE_URL % channel_id
+        url = self._build_template_url(url, channel_id)
 
         # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
         # Workaround by extracting as a playlist if managed to obtain channel playlist URL
@@ -1956,9 +1998,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
             channel_playlist_id = self._html_search_meta(
                 'channelId', channel_page, 'channel id', default=None)
             if not channel_playlist_id:
-                channel_playlist_id = self._search_regex(
-                    r'data-(?:channel-external-|yt)id="([^"]+)"',
-                    channel_page, 'channel id', default=None)
+                channel_url = self._html_search_meta(
+                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
+                    channel_page, 'channel url', default=None)
+                if channel_url:
+                    channel_playlist_id = self._search_regex(
+                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
+                        channel_url, 'channel id', default=None)
         if channel_playlist_id and channel_playlist_id.startswith('UC'):
             playlist_id = 'UU' + channel_playlist_id[2:]
             return self.url_result(
@@ -1981,24 +2027,53 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
                 for video_id, video_title in self.extract_videos_from_page(channel_page)]
             return self.playlist_result(entries, channel_id)
 
+        try:
+            next(self._entries(channel_page, channel_id))
+        except StopIteration:
+            alert_message = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
+                channel_page, 'alert', default=None, group='alert')
+            if alert_message:
+                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+
         return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
 
 
 class YoutubeUserIE(YoutubeChannelIE):
     IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
     IE_NAME = 'youtube:user'
 
     _TESTS = [{
         'url': 'https://www.youtube.com/user/TheLinuxFoundation',
         'playlist_mincount': 320,
         'info_dict': {
-            'title': 'TheLinuxFoundation',
+            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
+            'title': 'Uploads from The Linux Foundation',
+        }
+    }, {
+        # Only available via https://www.youtube.com/c/12minuteathlete/videos
+        # but not https://www.youtube.com/user/12minuteathlete/videos
+        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
+        'playlist_mincount': 249,
+        'info_dict': {
+            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
+            'title': 'Uploads from 12 Minute Athlete',
         }
     }, {
         'url': 'ytuser:phihag',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/c/gametrailers',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/gametrailers',
+        'only_matching': True,
+    }, {
+        # This channel is not available.
+        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -2011,6 +2086,10 @@ class YoutubeUserIE(YoutubeChannelIE):
         else:
             return super(YoutubeUserIE, cls).suitable(url)
 
+    def _build_template_url(self, url, channel_id):
+        mobj = re.match(self._VALID_URL, url)
+        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
 
 class YoutubeLiveIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com live streams'
@@ -2139,10 +2218,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
     _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
 
 
-class YoutubeSearchURLIE(InfoExtractor):
+class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com search URLs'
     IE_NAME = 'youtube:search_url'
     _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
     _TESTS = [{
         'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
         'playlist_mincount': 5,
@@ -2157,32 +2237,8 @@ class YoutubeSearchURLIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         query = compat_urllib_parse_unquote_plus(mobj.group('query'))
-
         webpage = self._download_webpage(url, query)
-        result_code = self._search_regex(
-            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
-
-        part_codes = re.findall(
-            r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)
-        entries = []
-        for part_code in part_codes:
-            part_title = self._html_search_regex(
-                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
-            part_url_snippet = self._html_search_regex(
-                r'(?s)href="([^"]+)"', part_code, 'item URL')
-            part_url = compat_urlparse.urljoin(
-                'https://www.youtube.com/', part_url_snippet)
-            entries.append({
-                '_type': 'url',
-                'url': part_url,
-                'title': part_title,
-            })
-
-        return {
-            '_type': 'playlist',
-            'entries': entries,
-            'title': query,
-        }
+        return self.playlist_result(self._process_page(webpage), playlist_title=query)
 
 
 class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
index a7440c58242079ea1c6874e1bed0abe756fdc814..9737f70021d3285a4e8df616467b764de1a91fa2 100644 (file)
@@ -232,7 +232,7 @@ class JSInterpreter(object):
     def extract_function(self, funcname):
         func_m = re.search(
             r'''(?x)
-                (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
+                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
                 \((?P<args>[^)]*)\)\s*
                 \{(?P<code>[^}]+)\}''' % (
                 re.escape(funcname), re.escape(funcname), re.escape(funcname)),
index d1f8d1331cf153a58a42b4220ebe37b441f3df4b..c4a85b2c09c17eb02123836a2341efc93fb6d283 100644 (file)
@@ -26,9 +26,11 @@ def parseOpts(overrideArguments=None):
         except IOError:
             return default  # silently skip if file is not present
         try:
-            res = []
-            for l in optionf:
-                res += compat_shlex_split(l, comments=True)
+            # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+            contents = optionf.read()
+            if sys.version_info < (3,):
+                contents = contents.decode(preferredencoding())
+            res = compat_shlex_split(contents, comments=True)
         finally:
             optionf.close()
         return res
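
The replacement reads the whole configuration file and splits it once, so quoting and # comments are handled by a single shlex pass instead of line by line. An illustration with the stdlib shlex, which compat_shlex_split wraps:

    import shlex

    # Quoted arguments survive intact and comments are dropped:
    contents = '--output "%(title)s.%(ext)s"  # my template\n-f best\n'
    print(shlex.split(contents, comments=True))
    # ['--output', '%(title)s.%(ext)s', '-f', 'best']
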
@@ -188,7 +190,10 @@ def parseOpts(overrideArguments=None):
     network.add_option(
         '--proxy', dest='proxy',
         default=None, metavar='URL',
-        help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
+        help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental '
+             'SOCKS proxy support, specify a proper scheme, for example '
+             'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+             'for direct connection')
     network.add_option(
         '--socket-timeout',
         dest='socket_timeout', type=float, default=None, metavar='SECONDS',
@@ -208,11 +213,16 @@ def parseOpts(overrideArguments=None):
         action='store_const', const='::', dest='source_address',
         help='Make all connections via IPv6 (experimental)',
     )
+    network.add_option(
+        '--geo-verification-proxy',
+        dest='geo_verification_proxy', default=None, metavar='URL',
+        help='Use this proxy to verify the IP address for some geo-restricted sites. '
+        'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. (experimental)'
+    )
     network.add_option(
         '--cn-verification-proxy',
         dest='cn_verification_proxy', default=None, metavar='URL',
-        help='Use this proxy to verify the IP address for some Chinese sites. '
-        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)'
+        help=optparse.SUPPRESS_HELP,
     )
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
@@ -392,8 +402,8 @@ def parseOpts(overrideArguments=None):
 
     downloader = optparse.OptionGroup(parser, 'Download Options')
     downloader.add_option(
-        '-r', '--rate-limit',
-        dest='ratelimit', metavar='LIMIT',
+        '-r', '--limit-rate', '--rate-limit',
+        dest='ratelimit', metavar='RATE',
         help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)')
     downloader.add_option(
         '-R', '--retries',
@@ -665,7 +675,7 @@ def parseOpts(overrideArguments=None):
         action='store_true', dest='writeannotations', default=False,
         help='Write video annotations to a .annotations.xml file')
     filesystem.add_option(
-        '--load-info',
+        '--load-info-json', '--load-info',
         dest='load_info_filename', metavar='FILE',
         help='JSON file containing the video information (created with the "--write-info-json" option)')
     filesystem.add_option(
@@ -806,11 +816,11 @@ def parseOpts(overrideArguments=None):
             system_conf = []
             user_conf = []
         else:
-            system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf'))
+            system_conf = _readOptions('/etc/youtube-dl.conf')
             if '--ignore-config' in system_conf:
                 user_conf = []
             else:
-                user_conf = compat_conf(_readUserConf())
+                user_conf = _readUserConf()
         argv = system_conf + user_conf + command_line_conf
 
         opts, args = parser.parse_args(argv)
index 74f66d669c0679a9eece06b1924ecc9f5dae00d2..90630c2d7391de9fd288662c8f207433702f8c99 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import subprocess
 
 from .common import PostProcessor
-from ..compat import shlex_quote
+from ..compat import compat_shlex_quote
 from ..utils import PostProcessingError
 
 
@@ -17,7 +17,7 @@ class ExecAfterDownloadPP(PostProcessor):
         if '{}' not in cmd:
             cmd += ' {}'
 
-        cmd = cmd.replace('{}', shlex_quote(information['filepath']))
+        cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))
 
         self._downloader.to_screen('[exec] Executing command: %s' % cmd)
         retCode = subprocess.call(cmd, shell=True)
index ca2d401f87e2594bce5b18c1be18491ce0815fd5..c1e9eb159ed9c2f969ad2ae3b9f90847b83cd98d 100644 (file)
@@ -391,23 +391,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
 class FFmpegMetadataPP(FFmpegPostProcessor):
     def run(self, info):
         metadata = {}
-        if info.get('title') is not None:
-            metadata['title'] = info['title']
-        if info.get('upload_date') is not None:
-            metadata['date'] = info['upload_date']
-        if info.get('artist') is not None:
-            metadata['artist'] = info['artist']
-        elif info.get('uploader') is not None:
-            metadata['artist'] = info['uploader']
-        elif info.get('uploader_id') is not None:
-            metadata['artist'] = info['uploader_id']
-        if info.get('description') is not None:
-            metadata['description'] = info['description']
-            metadata['comment'] = info['description']
-        if info.get('webpage_url') is not None:
-            metadata['purl'] = info['webpage_url']
-        if info.get('album') is not None:
-            metadata['album'] = info['album']
+
+        def add(meta_list, info_list=None):
+            if not info_list:
+                info_list = meta_list
+            if not isinstance(meta_list, (list, tuple)):
+                meta_list = (meta_list,)
+            if not isinstance(info_list, (list, tuple)):
+                info_list = (info_list,)
+            for info_f in info_list:
+                if info.get(info_f) is not None:
+                    for meta_f in meta_list:
+                        metadata[meta_f] = info[info_f]
+                    break
+
+        add('title', ('track', 'title'))
+        add('date', 'upload_date')
+        add(('description', 'comment'), 'description')
+        add('purl', 'webpage_url')
+        add('track', 'track_number')
+        add('artist', ('artist', 'creator', 'uploader', 'uploader_id'))
+        add('genre')
+        add('album')
+        add('album_artist')
+        add('disc', 'disc_number')
 
         if not metadata:
             self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
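
The table-driven add() helper writes one or more metadata keys from the first info field that is present, so e.g. 'artist' falls back from artist to creator to uploader to uploader_id. A standalone rendition of the same fallback logic:

    def add(metadata, info, meta_list, info_list=None):
        # The first populated info field wins and may feed several metadata keys.
        if not info_list:
            info_list = meta_list
        if not isinstance(meta_list, (list, tuple)):
            meta_list = (meta_list,)
        if not isinstance(info_list, (list, tuple)):
            info_list = (info_list,)
        for info_f in info_list:
            if info.get(info_f) is not None:
                for meta_f in meta_list:
                    metadata[meta_f] = info[info_f]
                break

    metadata = {}
    add(metadata, {'uploader': 'someone'}, 'artist', ('artist', 'creator', 'uploader', 'uploader_id'))
    assert metadata == {'artist': 'someone'}
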
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
new file mode 100644 (file)
index 0000000..1048072
--- /dev/null
@@ -0,0 +1,271 @@
+# Public Domain SOCKS proxy protocol implementation
+# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3
+
+from __future__ import unicode_literals
+
+# References:
+# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol
+# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol
+# SOCKS5 protocol https://tools.ietf.org/html/rfc1928
+# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929
+
+import collections
+import socket
+
+from .compat import (
+    compat_ord,
+    compat_struct_pack,
+    compat_struct_unpack,
+)
+
+__author__ = 'Timo Schmid <coding@timoschmid.de>'
+
+SOCKS4_VERSION = 4
+SOCKS4_REPLY_VERSION = 0x00
+# Excerpt from SOCKS4A protocol:
+# if the client cannot resolve the destination host's domain name to find its
+# IP address, it should set the first three bytes of DSTIP to NULL and the last
+# byte to a non-zero value.
+SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF)
+
+SOCKS5_VERSION = 5
+SOCKS5_USER_AUTH_VERSION = 0x01
+SOCKS5_USER_AUTH_SUCCESS = 0x00
+
+
+class Socks4Command(object):
+    CMD_CONNECT = 0x01
+    CMD_BIND = 0x02
+
+
+class Socks5Command(Socks4Command):
+    CMD_UDP_ASSOCIATE = 0x03
+
+
+class Socks5Auth(object):
+    AUTH_NONE = 0x00
+    AUTH_GSSAPI = 0x01
+    AUTH_USER_PASS = 0x02
+    AUTH_NO_ACCEPTABLE = 0xFF  # For server response
+
+
+class Socks5AddressType(object):
+    ATYP_IPV4 = 0x01
+    ATYP_DOMAINNAME = 0x03
+    ATYP_IPV6 = 0x04
+
+
+class ProxyError(IOError):
+    ERR_SUCCESS = 0x00
+
+    def __init__(self, code=None, msg=None):
+        if code is not None and msg is None:
+            msg = self.CODES.get(code) or 'unknown error'
+        super(ProxyError, self).__init__(code, msg)
+
+
+class InvalidVersionError(ProxyError):
+    def __init__(self, expected_version, got_version):
+        msg = ('Invalid response version from server. Expected {0:02x} got '
+               '{1:02x}'.format(expected_version, got_version))
+        super(InvalidVersionError, self).__init__(0, msg)
+
+
+class Socks4Error(ProxyError):
+    ERR_SUCCESS = 90
+
+    CODES = {
+        91: 'request rejected or failed',
+        92: 'request rejected because SOCKS server cannot connect to identd on the client',
+        93: 'request rejected because the client program and identd report different user-ids'
+    }
+
+
+class Socks5Error(ProxyError):
+    ERR_GENERAL_FAILURE = 0x01
+
+    CODES = {
+        0x01: 'general SOCKS server failure',
+        0x02: 'connection not allowed by ruleset',
+        0x03: 'Network unreachable',
+        0x04: 'Host unreachable',
+        0x05: 'Connection refused',
+        0x06: 'TTL expired',
+        0x07: 'Command not supported',
+        0x08: 'Address type not supported',
+        0xFE: 'unknown username or invalid password',
+        0xFF: 'all offered authentication methods were rejected'
+    }
+
+
+class ProxyType(object):
+    SOCKS4 = 0
+    SOCKS4A = 1
+    SOCKS5 = 2
+
+Proxy = collections.namedtuple('Proxy', (
+    'type', 'host', 'port', 'username', 'password', 'remote_dns'))
+
+
+class sockssocket(socket.socket):
+    def __init__(self, *args, **kwargs):
+        self._proxy = None
+        super(sockssocket, self).__init__(*args, **kwargs)
+
+    def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None):
+        assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5)
+
+        self._proxy = Proxy(proxytype, addr, port, username, password, rdns)
+
+    def recvall(self, cnt):
+        data = b''
+        while len(data) < cnt:
+            cur = self.recv(cnt - len(data))
+            if not cur:
+                raise IOError('{0} bytes missing'.format(cnt - len(data)))
+            data += cur
+        return data
+
+    def _recv_bytes(self, cnt):
+        data = self.recvall(cnt)
+        return compat_struct_unpack('!{0}B'.format(cnt), data)
+
+    @staticmethod
+    def _len_and_data(data):
+        return compat_struct_pack('!B', len(data)) + data
+
+    def _check_response_version(self, expected_version, got_version):
+        if got_version != expected_version:
+            self.close()
+            raise InvalidVersionError(expected_version, got_version)
+
+    def _resolve_address(self, destaddr, default, use_remote_dns):
+        try:
+            return socket.inet_aton(destaddr)
+        except socket.error:
+            if use_remote_dns and self._proxy.remote_dns:
+                return default
+            else:
+                return socket.inet_aton(socket.gethostbyname(destaddr))
+
+    def _setup_socks4(self, address, is_4a=False):
+        destaddr, port = address
+
+        ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
+
+        packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
+
+        username = (self._proxy.username or '').encode('utf-8')
+        packet += username + b'\x00'
+
+        if is_4a and self._proxy.remote_dns:
+            packet += destaddr.encode('utf-8') + b'\x00'
+
+        self.sendall(packet)
+
+        version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8))
+
+        self._check_response_version(SOCKS4_REPLY_VERSION, version)
+
+        if resp_code != Socks4Error.ERR_SUCCESS:
+            self.close()
+            raise Socks4Error(resp_code)
+
+        return (dsthost, dstport)
+
+    def _setup_socks4a(self, address):
+        self._setup_socks4(address, is_4a=True)
+
+    def _socks5_auth(self):
+        packet = compat_struct_pack('!B', SOCKS5_VERSION)
+
+        auth_methods = [Socks5Auth.AUTH_NONE]
+        if self._proxy.username and self._proxy.password:
+            auth_methods.append(Socks5Auth.AUTH_USER_PASS)
+
+        packet += compat_struct_pack('!B', len(auth_methods))
+        packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods)
+
+        self.sendall(packet)
+
+        version, method = self._recv_bytes(2)
+
+        self._check_response_version(SOCKS5_VERSION, version)
+
+        if method == Socks5Auth.AUTH_NO_ACCEPTABLE:
+            self.close()
+            raise Socks5Error(method)
+
+        if method == Socks5Auth.AUTH_USER_PASS:
+            username = self._proxy.username.encode('utf-8')
+            password = self._proxy.password.encode('utf-8')
+            packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION)
+            packet += self._len_and_data(username) + self._len_and_data(password)
+            self.sendall(packet)
+
+            version, status = self._recv_bytes(2)
+
+            self._check_response_version(SOCKS5_USER_AUTH_VERSION, version)
+
+            if status != SOCKS5_USER_AUTH_SUCCESS:
+                self.close()
+                raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE)
+
+    def _setup_socks5(self, address):
+        destaddr, port = address
+
+        ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
+
+        self._socks5_auth()
+
+        reserved = 0
+        packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
+        if ipaddr is None:
+            destaddr = destaddr.encode('utf-8')
+            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
+            packet += self._len_and_data(destaddr)
+        else:
+            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+        packet += compat_struct_pack('!H', port)
+
+        self.sendall(packet)
+
+        version, status, reserved, atype = self._recv_bytes(4)
+
+        self._check_response_version(SOCKS5_VERSION, version)
+
+        if status != Socks5Error.ERR_SUCCESS:
+            self.close()
+            raise Socks5Error(status)
+
+        if atype == Socks5AddressType.ATYP_IPV4:
+            destaddr = self.recvall(4)
+        elif atype == Socks5AddressType.ATYP_DOMAINNAME:
+            alen = compat_ord(self.recv(1))
+            destaddr = self.recvall(alen)
+        elif atype == Socks5AddressType.ATYP_IPV6:
+            destaddr = self.recvall(16)
+        destport = compat_struct_unpack('!H', self.recvall(2))[0]
+
+        return (destaddr, destport)
+
+    def _make_proxy(self, connect_func, address):
+        if not self._proxy:
+            return connect_func(self, address)
+
+        result = connect_func(self, (self._proxy.host, self._proxy.port))
+        if result != 0 and result is not None:
+            return result
+        setup_funcs = {
+            ProxyType.SOCKS4: self._setup_socks4,
+            ProxyType.SOCKS4A: self._setup_socks4a,
+            ProxyType.SOCKS5: self._setup_socks5,
+        }
+        setup_funcs[self._proxy.type](address)
+        return result
+
+    def connect(self, address):
+        self._make_proxy(socket.socket.connect, address)
+
+    def connect_ex(self, address):
+        return self._make_proxy(socket.socket.connect_ex, address)
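
Since sockssocket subclasses socket.socket and performs the SOCKS handshake inside connect(), it drops into any code that expects a plain socket. A hedged usage sketch, assuming a SOCKS5 proxy listening locally on port 1080:

    import socket

    from youtube_dl.socks import ProxyType, sockssocket

    s = sockssocket(socket.AF_INET, socket.SOCK_STREAM)
    s.setproxy(ProxyType.SOCKS5, '127.0.0.1', 1080, rdns=True)
    s.connect(('example.com', 80))  # handshake happens transparently here
    s.sendall(b'HEAD / HTTP/1.0\r\nHost: example.com\r\n\r\n')
    print(s.recv(256))
    s.close()
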
index 06c1d6cc1755ef022aa78967d4b651e21fd66618..7cf490aa43a878b3c377bea0b173c7a2b170c2c7 100644 (file)
@@ -4,10 +4,12 @@ import collections
 import io
 import zlib
 
-from .compat import compat_str
+from .compat import (
+    compat_str,
+    compat_struct_unpack,
+)
 from .utils import (
     ExtractorError,
-    struct_unpack,
 )
 
 
@@ -23,17 +25,17 @@ def _extract_tags(file_contents):
             file_contents[:1])
 
     # Determine number of bits in framesize rectangle
-    framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3
+    framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3
     framesize_len = (5 + 4 * framesize_nbits + 7) // 8
 
     pos = framesize_len + 2 + 2
     while pos < len(content):
-        header16 = struct_unpack('<H', content[pos:pos + 2])[0]
+        header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0]
         pos += 2
         tag_code = header16 >> 6
         tag_len = header16 & 0x3f
         if tag_len == 0x3f:
-            tag_len = struct_unpack('<I', content[pos:pos + 4])[0]
+            tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0]
             pos += 4
         assert pos + tag_len <= len(content), \
             ('Tag %d ends at %d+%d - that\'s longer than the file (%d)'
@@ -101,7 +103,7 @@ def _read_int(reader):
     for _ in range(5):
         buf = reader.read(1)
         assert len(buf) == 1
-        b = struct_unpack('<B', buf)[0]
+        b = compat_struct_unpack('<B', buf)[0]
         res = res | ((b & 0x7f) << shift)
         if b & 0x80 == 0:
             break
@@ -127,7 +129,7 @@ def _s24(reader):
     bs = reader.read(3)
     assert len(bs) == 3
     last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00'
-    return struct_unpack('<i', bs + last_byte)[0]
+    return compat_struct_unpack('<i', bs + last_byte)[0]
 
 
 def _read_string(reader):
@@ -146,7 +148,7 @@ def _read_bytes(count, reader):
 
 def _read_byte(reader):
     resb = _read_bytes(1, reader=reader)
-    res = struct_unpack('<B', resb)[0]
+    res = compat_struct_unpack('<B', resb)[0]
     return res
 
 
index 676ebe1c42d1d6b54eb50bfc3f087e6fee8e20f0..ebce9666a21465b53b93ccd0bd263b29349720a0 100644 (file)
@@ -83,11 +83,8 @@ def update_self(to_screen, verbose, opener):
 
     print_notes(to_screen, versions_info['versions'])
 
-    filename = sys.argv[0]
-    # Py2EXE: Filename could be different
-    if hasattr(sys, 'frozen') and not os.path.isfile(filename):
-        if os.path.isfile(filename + '.exe'):
-            filename += '.exe'
+    # sys.executable is set to the full pathname of the exe-file for py2exe
+    filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0]
 
     if not os.access(filename, os.W_OK):
         to_screen('ERROR: no write permissions on %s' % filename)
@@ -95,7 +92,7 @@ def update_self(to_screen, verbose, opener):
 
     # Py2EXE
     if hasattr(sys, 'frozen'):
-        exe = os.path.abspath(filename)
+        exe = filename
         directory = os.path.dirname(exe)
         if not os.access(directory, os.W_OK):
             to_screen('ERROR: no write permissions on %s' % directory)
index f333e471275a69cbd158828c90f0ed1b5522582f..3498697b60d70c45d4041f80f59945aefeb9e035 100644 (file)
@@ -14,8 +14,8 @@ import email.utils
 import errno
 import functools
 import gzip
-import itertools
 import io
+import itertools
 import json
 import locale
 import math
@@ -24,9 +24,8 @@ import os
 import pipes
 import platform
 import re
-import ssl
 import socket
-import struct
+import ssl
 import subprocess
 import sys
 import tempfile
@@ -40,27 +39,44 @@ from .compat import (
     compat_chr,
     compat_etree_fromstring,
     compat_html_entities,
+    compat_html_entities_html5,
     compat_http_client,
     compat_kwargs,
     compat_parse_qs,
+    compat_shlex_quote,
     compat_socket_create_connection,
     compat_str,
+    compat_struct_pack,
     compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_request,
     compat_urlparse,
     compat_xpath,
-    shlex_quote,
+)
+
+from .socks import (
+    ProxyType,
+    sockssocket,
 )
 
 
+def register_socks_protocols():
+    # "Register" SOCKS protocols
+    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
+    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
+    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+        if scheme not in compat_urlparse.uses_netloc:
+            compat_urlparse.uses_netloc.append(scheme)
+
+
 # This is not clearly defined otherwise
 compiled_regex_type = type(re.compile(''))
 
 std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Encoding': 'gzip, deflate',
@@ -89,6 +105,54 @@ KNOWN_EXTENSIONS = (
     'wav',
     'f4f', 'f4m', 'm3u8', 'smil')
 
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiidnooooooo', ['oe'], 'uuuuuypy')))
+
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
+
 
 def preferredencoding():
     """Get preferred encoding.
@@ -246,18 +310,26 @@ def get_element_by_id(id, html):
     return get_element_by_attribute('id', id, html)
 
 
-def get_element_by_attribute(attribute, value, html):
+def get_element_by_class(class_name, html):
+    return get_element_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
     """Return the content of the tag with the specified attribute in the passed HTML document"""
 
+    value = re.escape(value) if escape_value else value
+
     m = re.search(r'''(?xs)
         <([a-zA-Z0-9:._-]+)
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
          \s+%s=['"]?%s['"]?
-         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s*>
         (?P<content>.*?)
         </\1>
-    ''' % (re.escape(attribute), re.escape(value)), html)
+    ''' % (re.escape(attribute), value), html)
 
     if not m:
         return None
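
get_element_by_class builds on the relaxed attribute matching: the class name is turned into a token regex (escape_value=False), so it matches one class among several rather than the exact attribute value. For instance:

    from youtube_dl.utils import get_element_by_class

    html = '<span class="foo bar baz">content</span>'
    # Matches on a whole class token, not the exact attribute value:
    assert get_element_by_class('bar', html) == 'content'
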
@@ -365,6 +437,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
     """
     def replace_insane(char):
+        if restricted and char in ACCENT_CHARS:
+            return ACCENT_CHARS[char]
         if char == '?' or ord(char) < 32 or ord(char) == 127:
             return ''
         elif char == '"':
@@ -434,12 +508,19 @@ def orderedSet(iterable):
     return res
 
 
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
     """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
     # Known non-numeric HTML entity
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
@@ -464,7 +545,7 @@ def unescapeHTML(s):
     assert type(s) == compat_str
 
     return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def get_subprocess_encoding():
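
With the semicolon now captured, unescapeHTML can first try the legacy name2codepoint table (keyed without ';') and then the HTML5 table (keyed with ';'), which covers entities such as &period; that HTML4 lacks. Illustrative expectations:

    from youtube_dl.utils import unescapeHTML

    assert unescapeHTML('&amp;') == '&'       # legacy named entity
    assert unescapeHTML('&period;') == '.'    # HTML5-only named entity
    assert unescapeHTML('&#x2F;') == '/'      # numeric reference still works
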
@@ -745,8 +826,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
         self._params = params
 
     def http_open(self, req):
+        conn_class = compat_http_client.HTTPConnection
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
         return self.do_open(functools.partial(
-            _create_http_connection, self, compat_http_client.HTTPConnection, False),
+            _create_http_connection, self, conn_class, False),
             req)
 
     @staticmethod
@@ -832,9 +920,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                 if sys.version_info >= (3, 0):
                     location = location.encode('iso-8859-1').decode('utf-8')
+                else:
+                    location = location.decode('utf-8')
                 location_escaped = escape_url(location)
                 if location != location_escaped:
                     del resp.headers['Location']
+                    if sys.version_info < (3, 0):
+                        location_escaped = location_escaped.encode('utf-8')
                     resp.headers['Location'] = location_escaped
         return resp
 
@@ -842,6 +934,49 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     https_response = http_response
 
 
+def make_socks_conn_class(base_class, socks_proxy):
+    assert issubclass(base_class, (
+        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
+
+    url_components = compat_urlparse.urlparse(socks_proxy)
+    if url_components.scheme.lower() == 'socks5':
+        socks_type = ProxyType.SOCKS5
+    elif url_components.scheme.lower() in ('socks', 'socks4'):
+        socks_type = ProxyType.SOCKS4
+    elif url_components.scheme.lower() == 'socks4a':
+        socks_type = ProxyType.SOCKS4A
+
+    def unquote_if_non_empty(s):
+        if not s:
+            return s
+        return compat_urllib_parse_unquote_plus(s)
+
+    proxy_args = (
+        socks_type,
+        url_components.hostname, url_components.port or 1080,
+        True,  # Remote DNS
+        unquote_if_non_empty(url_components.username),
+        unquote_if_non_empty(url_components.password),
+    )
+
+    class SocksConnection(base_class):
+        def connect(self):
+            self.sock = sockssocket()
+            self.sock.setproxy(*proxy_args)
+            if type(self.timeout) in (int, float):
+                self.sock.settimeout(self.timeout)
+            self.sock.connect((self.host, self.port))
+
+            if isinstance(self, compat_http_client.HTTPSConnection):
+                if hasattr(self, '_context'):  # Python > 2.6
+                    self.sock = self._context.wrap_socket(
+                        self.sock, server_hostname=self.host)
+                else:
+                    self.sock = ssl.wrap_socket(self.sock)
+
+    return SocksConnection
+
+
 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
     def __init__(self, params, https_conn_class=None, *args, **kwargs):
         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
@@ -850,12 +985,20 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 
     def https_open(self, req):
         kwargs = {}
+        conn_class = self._https_conn_class
+
         if hasattr(self, '_context'):  # python > 2.6
             kwargs['context'] = self._context
         if hasattr(self, '_check_hostname'):  # python 3.x
             kwargs['check_hostname'] = self._check_hostname
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
         return self.do_open(functools.partial(
-            _create_http_connection, self, self._https_conn_class, True),
+            _create_http_connection, self, conn_class, True),
             req, **kwargs)
 
 
@@ -883,6 +1026,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
     https_response = http_response
 
 
+def extract_timezone(date_str):
+    m = re.search(
+        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        date_str)
+    if not m:
+        timezone = datetime.timedelta()
+    else:
+        date_str = date_str[:-len(m.group('tz'))]
+        if not m.group('sign'):
+            timezone = datetime.timedelta()
+        else:
+            sign = 1 if m.group('sign') == '+' else -1
+            timezone = datetime.timedelta(
+                hours=sign * int(m.group('hours')),
+                minutes=sign * int(m.group('minutes')))
+    return timezone, date_str
+
+
 def parse_iso8601(date_str, delimiter='T', timezone=None):
     """ Return a UNIX timestamp from the given date """
 
@@ -892,20 +1053,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
     date_str = re.sub(r'\.[0-9]+', '', date_str)
 
     if timezone is None:
-        m = re.search(
-            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
-            date_str)
-        if not m:
-            timezone = datetime.timedelta()
-        else:
-            date_str = date_str[:-len(m.group(0))]
-            if not m.group('sign'):
-                timezone = datetime.timedelta()
-            else:
-                sign = 1 if m.group('sign') == '+' else -1
-                timezone = datetime.timedelta(
-                    hours=sign * int(m.group('hours')),
-                    minutes=sign * int(m.group('minutes')))
+        timezone, date_str = extract_timezone(date_str)
+
     try:
         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
         dt = datetime.datetime.strptime(date_str, date_format) - timezone
@@ -914,6 +1063,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
         pass
 
 
+def date_formats(day_first=True):
+    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
+
+
 def unified_strdate(date_str, day_first=True):
     """Return a string with the date in the format YYYYMMDD"""
 
@@ -922,52 +1075,11 @@ def unified_strdate(date_str, day_first=True):
     upload_date = None
     # Replace commas
     date_str = date_str.replace(',', ' ')
-    # %z (UTC offset) is only supported in python>=3.2
-    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
-        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
     # Remove AM/PM + timezone
     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+    _, date_str = extract_timezone(date_str)
 
-    format_expressions = [
-        '%d %B %Y',
-        '%d %b %Y',
-        '%B %d %Y',
-        '%b %d %Y',
-        '%b %dst %Y %I:%M',
-        '%b %dnd %Y %I:%M',
-        '%b %dth %Y %I:%M',
-        '%Y %m %d',
-        '%Y-%m-%d',
-        '%Y/%m/%d',
-        '%Y/%m/%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S.%f',
-        '%d.%m.%Y %H:%M',
-        '%d.%m.%Y %H.%M',
-        '%Y-%m-%dT%H:%M:%SZ',
-        '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f0Z',
-        '%Y-%m-%dT%H:%M:%S',
-        '%Y-%m-%dT%H:%M:%S.%f',
-        '%Y-%m-%dT%H:%M',
-    ]
-    if day_first:
-        format_expressions.extend([
-            '%d-%m-%Y',
-            '%d.%m.%Y',
-            '%d/%m/%Y',
-            '%d/%m/%y',
-            '%d/%m/%Y %H:%M:%S',
-        ])
-    else:
-        format_expressions.extend([
-            '%m-%d-%Y',
-            '%m.%d.%Y',
-            '%m/%d/%Y',
-            '%m/%d/%y',
-            '%m/%d/%Y %H:%M:%S',
-        ])
-    for expression in format_expressions:
+    for expression in date_formats(day_first):
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
         except ValueError:
@@ -975,11 +1087,37 @@ def unified_strdate(date_str, day_first=True):
     if upload_date is None:
         timetuple = email.utils.parsedate_tz(date_str)
         if timetuple:
-            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+            try:
+                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+            except ValueError:
+                pass
     if upload_date is not None:
         return compat_str(upload_date)
 
 
+def unified_timestamp(date_str, day_first=True):
+    if date_str is None:
+        return None
+
+    date_str = date_str.replace(',', ' ')
+
+    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    for expression in date_formats(day_first):
+        try:
+            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
+            return calendar.timegm(dt.timetuple())
+        except ValueError:
+            pass
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        # email.utils.parsedate_tz returns a plain tuple, which timegm accepts directly
+        return calendar.timegm(timetuple)
+
+
 def determine_ext(url, default_ext='unknown_video'):
     if url is None:
         return default_ext
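
unified_timestamp mirrors unified_strdate but returns a POSIX timestamp, folding the extracted timezone offset and a 12-hour PM shift into the parsed datetime before falling back to email.utils.parsedate_tz. Two illustrative values (UTC):

    from youtube_dl.utils import unified_timestamp

    assert unified_timestamp('December 15, 2015') == 1450137600
    assert unified_timestamp('2015-12-15T10:00:00+0100') == 1450170000
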
@@ -1186,7 +1324,7 @@ def bytes_to_intlist(bs):
 def intlist_to_bytes(xs):
     if not xs:
         return b''
-    return struct_pack('%dB' % len(xs), *xs)
+    return compat_struct_pack('%dB' % len(xs), *xs)
 
 
 # Cross-platform file locking
@@ -1314,6 +1452,8 @@ def shell_quote(args):
 def smuggle_url(url, data):
     """ Pass additional data in a URL for internal use. """
 
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
     sdata = compat_urllib_parse_urlencode(
         {'__youtubedl_smuggle': json.dumps(data)})
     return url + '#' + sdata
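
Because smuggle_url now unsmuggles first, re-smuggling an already smuggled URL merges the payloads instead of nesting one fragment inside another:

    from youtube_dl.utils import smuggle_url, unsmuggle_url

    url = smuggle_url('http://example.com/video', {'a': 1})
    url = smuggle_url(url, {'b': 2})
    base_url, data = unsmuggle_url(url)
    # Both payloads survive under a single fragment:
    assert (base_url, data) == ('http://example.com/video', {'a': 1, 'b': 2})
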
@@ -1469,15 +1609,11 @@ def setproctitle(title):
 
 
 def remove_start(s, start):
-    if s.startswith(start):
-        return s[len(start):]
-    return s
+    return s[len(start):] if s is not None and s.startswith(start) else s
 
 
 def remove_end(s, end):
-    if s.endswith(end):
-        return s[:-len(end)]
-    return s
+    return s[:-len(end)] if s is not None and s.endswith(end) else s
 
 
 def remove_quotes(s):
@@ -1499,6 +1635,11 @@ class HEADRequest(compat_urllib_request.Request):
         return 'HEAD'
 
 
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
     if get_attr:
         if v is not None:
@@ -1534,6 +1675,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
         return default
 
 
+def strip_or_none(v):
+    return None if v is None else v.strip()
+
+
 def parse_duration(s):
     if not isinstance(s, compat_basestring):
         return None
@@ -1754,24 +1899,6 @@ def escape_url(url):
         fragment=escape_rfc3986(url_parsed.fragment)
     ).geturl()
 
-try:
-    struct.pack('!I', 0)
-except TypeError:
-    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
-    # See https://bugs.python.org/issue19099
-    def struct_pack(spec, *args):
-        if isinstance(spec, compat_str):
-            spec = spec.encode('ascii')
-        return struct.pack(spec, *args)
-
-    def struct_unpack(spec, *args):
-        if isinstance(spec, compat_str):
-            spec = spec.encode('ascii')
-        return struct.unpack(spec, *args)
-else:
-    struct_pack = struct.pack
-    struct_unpack = struct.unpack
-
 
 def read_batch_urls(batch_fd):
     def fixup(url):
@@ -1808,7 +1935,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     req_headers.update(headers)
     req_data = data or req.data
     req_url = update_url_query(url or req.get_full_url(), query)
-    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
     new_req = req_type(
         req_url, data=req_data, headers=req_headers,
         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
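The dispatch matters because rebuilding a request used to turn anything that was not HEAD into a plain Request; now the PUT method survives a rebuild (URL is illustrative):

    >>> from youtube_dl.utils import PUTRequest, update_Request
    >>> req = PUTRequest('http://example.com/upload')
    >>> update_Request(req, query={'token': 'abc'}).get_method()
    'PUT'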
@@ -1827,6 +1960,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
     return d.get(key_or_keys, default)
 
 
+def try_get(src, getter, expected_type=None):
+    try:
+        v = getter(src)
+    except (AttributeError, KeyError, TypeError, IndexError):
+        pass
+    else:
+        if expected_type is None or isinstance(v, expected_type):
+            return v
+
+
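try_get() gives extractors a compact way to dig into optional JSON without try/except chains (illustrative data):

    >>> from youtube_dl.utils import try_get
    >>> data = {'items': [{'id': 42}]}
    >>> try_get(data, lambda x: x['items'][0]['id'], int)
    42
    >>> try_get(data, lambda x: x['missing'][0]) is None
    True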
 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
     return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
 
@@ -1849,7 +1992,7 @@ def parse_age_limit(s):
 
 def strip_jsonp(code):
     return re.sub(
-        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
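The widened character class accepts '$' in callback names, which minified JSONP wrappers often use (illustrative payload):

    >>> from youtube_dl.utils import strip_jsonp
    >>> strip_jsonp('window.cb$1({"status": "ok"});')
    '{"status": "ok"}'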
 
 
 def js_to_json(code):
@@ -1857,24 +2000,38 @@ def js_to_json(code):
         v = m.group(0)
         if v in ('true', 'false', 'null'):
             return v
-        if v.startswith('"'):
-            v = re.sub(r"\\'", "'", v[1:-1])
-        elif v.startswith("'"):
-            v = v[1:-1]
-            v = re.sub(r"\\\\|\\'|\"", lambda m: {
-                '\\\\': '\\\\',
-                "\\'": "'",
+        elif v.startswith('/*') or v == ',':
+            return ''
+
+        if v[0] in ("'", '"'):
+            v = re.sub(r'(?s)\\.|"', lambda m: {
                 '"': '\\"',
-            }[m.group(0)], v)
+                "\\'": "'",
+                '\\\n': '',
+                '\\x': '\\u00',
+            }.get(m.group(0), m.group(0)), v[1:-1])
+
+        INTEGER_TABLE = (
+            (r'^0[xX][0-9a-fA-F]+', 16),
+            (r'^0+[0-7]+', 8),
+        )
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(0), base)
+                return '"%d":' % i if v.endswith(':') else '%d' % i
+
         return '"%s"' % v
 
-    res = re.sub(r'''(?x)
-        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
-        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
-        [a-zA-Z_][.a-zA-Z_0-9]*
+    return re.sub(r'''(?sx)
+        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
+        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
+        /\*.*?\*/|,(?=\s*[\]}])|
+        [a-zA-Z_][.a-zA-Z_0-9]*|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+        [0-9]+(?=\s*:)
         ''', fix_kv, code)
-    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
-    return res
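Roughly, the rewrite teaches js_to_json() about comments, hex/octal integers, richer string escapes and trailing commas in a single pass (whitespace in the output mirrors the input):

    >>> from youtube_dl.utils import js_to_json
    >>> js_to_json('{abc: 1}')
    '{"abc": 1}'
    >>> js_to_json("{/* comment */ foo: 0x14, bar: 'baz',}")
    '{ "foo": 20, "bar": "baz"}'

One known limit of the single pass: a comment sitting between a trailing comma and the closing brace defeats the trailing-comma lookahead.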
 
 
 def qualities(quality_ids):
@@ -1922,7 +2079,7 @@ def ytdl_is_updateable():
 
 def args_to_str(args):
     # Get a short string representation for a subprocess command
-    return ' '.join(shlex_quote(a) for a in args)
+    return ' '.join(compat_shlex_quote(a) for a in args)
 
 
 def error_to_compat_str(err):
@@ -1935,13 +2092,20 @@ def error_to_compat_str(err):
 
 
 def mimetype2ext(mt):
+    if mt is None:
+        return None
+
     ext = {
         'audio/mp4': 'm4a',
+        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3; use .mp3 here,
+        # as it is by far the most common
+        'audio/mpeg': 'mp3',
     }.get(mt)
     if ext is not None:
         return ext
 
     _, _, res = mt.rpartition('/')
+    res = res.lower()
 
     return {
         '3gpp': '3gp',
@@ -1953,15 +2117,17 @@ def mimetype2ext(mt):
         'x-flv': 'flv',
         'x-mp4-fragmented': 'mp4',
         'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m': 'f4m',
+        'f4m+xml': 'f4m',
     }.get(res, res)
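The effect of the new entries plus the lowercased subtype (HLS servers are notoriously inconsistent about case):

    >>> from youtube_dl.utils import mimetype2ext
    >>> mimetype2ext('application/x-mpegURL')
    'm3u8'
    >>> mimetype2ext('audio/mpeg')
    'mp3'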
 
 
 def urlhandle_detect_ext(url_handle):
-    try:
-        url_handle.headers
-        getheader = lambda h: url_handle.headers[h]
-    except AttributeError:  # Python < 3
-        getheader = url_handle.info().getheader
+    getheader = url_handle.headers.get
 
     cd = getheader('Content-Disposition')
     if cd:
@@ -2691,6 +2857,10 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
 
         if proxy == '__noproxy__':
             return None  # No Proxy
+        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+            req.add_header('Ytdl-socks-proxy', proxy)
+            # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
+            return None
         return compat_urllib_request.ProxyHandler.proxy_open(
             self, req, proxy, type)
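In practice this is what lets a SOCKS proxy be configured like any other; a minimal sketch via the embedding API (the proxy address is illustrative):

    import youtube_dl

    # socks*:// proxies are not opened here; the HTTP(S) handlers wrap the
    # socket themselves, keyed off the Ytdl-socks-proxy header set above
    ydl = youtube_dl.YoutubeDL({'proxy': 'socks5://127.0.0.1:1080/'})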
 
@@ -2748,3 +2918,16 @@ def decode_packed_codes(code):
     return re.sub(
         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
         obfucasted_code)
+
+
+def parse_m3u8_attributes(attrib):
+    info = {}
+    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
+        if val.startswith('"'):
+            val = val[1:-1]
+        info[key] = val
+    return info
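A quick sketch of the attribute parser; note that a quoted value keeps any commas inside it:

    >>> from youtube_dl.utils import parse_m3u8_attributes
    >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"') == \
    ...     {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}
    True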
+
+
+def urshift(val, n):
+    return val >> n if val >= 0 else (val + 0x100000000) >> n
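urshift() mirrors JavaScript's unsigned right shift (>>>) for the 32-bit values that extractor deobfuscation code juggles:

    >>> from youtube_dl.utils import urshift
    >>> urshift(-1, 1)
    2147483647
    >>> urshift(16, 2)
    4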
youtube_dl/version.py
index 140a67847df1bce886625ea7db1e0694c072df4c..3e45977c926534aa4a8ff8b24fcf0a87df09dcdd 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.04.19'
+__version__ = '2016.07.09.1'