Merge pull request #8611 from remitamine/ffmpegfd
author remitamine <remitamine@gmail.com>
Sun, 13 Mar 2016 20:30:27 +0000 (21:30 +0100)
committer remitamine <remitamine@gmail.com>
Sun, 13 Mar 2016 20:30:27 +0000 (21:30 +0100)
[downloader/external] Add FFmpegFD

102 files changed:
.gitignore
AUTHORS
CONTRIBUTING.md
Makefile
README.md
docs/supportedsites.md
test/helper.py
test/test_YoutubeDL.py
test/test_http.py
test/test_iqiyi_sdk_interpreter.py [new file with mode: 0644]
test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/compat.py
youtube_dl/downloader/common.py
youtube_dl/downloader/fragment.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/aol.py
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/audimedia.py
youtube_dl/extractor/audioboom.py [new file with mode: 0644]
youtube_dl/extractor/bbc.py
youtube_dl/extractor/bleacherreport.py
youtube_dl/extractor/bokecc.py [new file with mode: 0644]
youtube_dl/extractor/c56.py
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/cnet.py
youtube_dl/extractor/common.py
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dplay.py
youtube_dl/extractor/dw.py [new file with mode: 0644]
youtube_dl/extractor/elpais.py
youtube_dl/extractor/engadget.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/faz.py
youtube_dl/extractor/fivemin.py
youtube_dl/extractor/foxnews.py
youtube_dl/extractor/freespeech.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/googledrive.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/indavideo.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/iqiyi.py
youtube_dl/extractor/jeuxvideo.py
youtube_dl/extractor/jwplatform.py
youtube_dl/extractor/kaltura.py
youtube_dl/extractor/khanacademy.py
youtube_dl/extractor/kusi.py [new file with mode: 0644]
youtube_dl/extractor/kuwo.py
youtube_dl/extractor/leeco.py [new file with mode: 0644]
youtube_dl/extractor/letv.py [deleted file]
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/livestream.py
youtube_dl/extractor/makerschannel.py [new file with mode: 0644]
youtube_dl/extractor/mdr.py
youtube_dl/extractor/minoto.py [new file with mode: 0644]
youtube_dl/extractor/mit.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/motherless.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/pyvideo.py
youtube_dl/extractor/revision3.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/safari.py
youtube_dl/extractor/screenwavemedia.py
youtube_dl/extractor/sexu.py
youtube_dl/extractor/space.py [deleted file]
youtube_dl/extractor/ted.py
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/twitter.py
youtube_dl/extractor/ustudio.py [new file with mode: 0644]
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/vice.py
youtube_dl/extractor/videomega.py
youtube_dl/extractor/vidzi.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/wat.py
youtube_dl/extractor/webofstories.py
youtube_dl/extractor/wimp.py
youtube_dl/extractor/wistia.py
youtube_dl/extractor/xfileshare.py
youtube_dl/extractor/yandexmusic.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zdf.py
youtube_dl/options.py
youtube_dl/postprocessor/__init__.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/postprocessor/xattrpp.py
youtube_dl/utils.py
youtube_dl/version.py

index 0422adf4456ec35166f5d2bce6b832c67601c2dc..26dbde73d412673ee9c53ee06a476a803a92edc7 100644 (file)
@@ -1,5 +1,6 @@
 *.pyc
 *.pyo
+*.class
 *~
 *.DS_Store
 wine-py2exe/
@@ -32,4 +33,4 @@ test/testdata
 .tox
 youtube-dl.zsh
 .idea
-.idea/*
\ No newline at end of file
+.idea/*
diff --git a/AUTHORS b/AUTHORS
index 0e90a3ecb4f5660a861273bb0fc422ae62e8111d..aa48cd5a6015aa965a23b4203349e3bc0a6f690d 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -160,3 +160,6 @@ Erwin de Haan
 Jens Wille
 Robin Houtevelts
 Patrick Griffis
+Aidan Rowe
+mutantmonkey
+Ben Congdon
index d15267d7ea1833f2340a834397148836dbf3912c..c996f03ab823c83546a92f907ff21d86a5fdb30d 100644 (file)
@@ -1,6 +1,6 @@
-**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
+**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** the `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
 ```
-$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj
+$ youtube-dl -v <your command line>
 [debug] System config: []
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
@@ -92,7 +92,9 @@ If you want to create a build of youtube-dl yourself, you'll need
 
 ### Adding support for a new site
 
-If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites, thus pull requests adding support for them **will be rejected**.
+
+After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
@@ -140,16 +142,17 @@ If you want to add support for a new site, you can follow this quick list (assum
     ```
 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want.
-8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want.
+8. Keep in mind that the only mandatory fields in the info dict for a successful extraction are `id`, `title` and either `url` or `formats`, i.e. these are the critical data without which extraction makes no sense. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from the aforementioned mandatory ones should be treated **as optional**, and extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** so as not to break the extraction of the mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into the resulting info dict as `description`, you should be prepared for this key to be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`/`_html_search_regex` (a minimal sketch follows this list).
+9. Check the code with [flake8](https://pypi.python.org/pypi/flake8).
+10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/__init__.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
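To make step 8 concrete, here is a minimal sketch of a tolerant `_real_extract` (the API URL, the `meta` keys and the uploader regex are hypothetical, for illustration only):

```python
def _real_extract(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    # Hypothetical intermediate metadata dict, as in the example above.
    meta = self._download_json(
        'http://example.com/api/video/%s' % video_id, video_id)

    return {
        'id': video_id,
        # Mandatory fields: it is acceptable for extraction to fail here.
        'title': meta['title'],
        'url': meta['video_url'],
        # Optional fields: use .get() and fatal=False so that a missing
        # source never breaks the whole extraction.
        'description': meta.get('summary'),
        'uploader': self._html_search_regex(
            r'class="uploader"[^>]*>([^<]+)', webpage, 'uploader',
            fatal=False),
    }
```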
 
 In any case, thank you very much for your contributions!
 
index f826c16857846635e2c84ae8954d44f9aac3e48b..e98806791327feaa67cd780e5395ce0cc960bbd8 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas
 clean:
        rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
        find . -name "*.pyc" -delete
+       find . -name "*.class" -delete
 
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
@@ -44,7 +45,7 @@ test:
 ot: offlinetest
 
 offlinetest: codetest
-       nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py
+       $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py
 
 tar: youtube-dl.tar.gz
 
index db49ab6d523a8f16fc03806d1ad27f439f5e7b14..a774e73e75ecbf1bc2c8c5c5b45798a66b866eef 100644 (file)
--- a/README.md
+++ b/README.md
@@ -80,6 +80,8 @@ which means you can modify it, redistribute it or use it however you like.
                                      on Windows)
     --flat-playlist                  Do not extract the videos of a playlist,
                                      only list them.
+    --mark-watched                   Mark videos watched (YouTube only)
+    --no-mark-watched                Do not mark videos watched (YouTube only)
     --no-color                       Do not emit color codes in output
 
 ## Network Options:
@@ -409,13 +411,18 @@ which means you can modify it, redistribute it or use it however you like.
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime and use a proxy:
+You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`.
+
+For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under the `Movies` directory in your home directory:
 ```
---extract-audio
+-x
 --no-mtime
 --proxy 127.0.0.1:3128
+-o ~/Movies/%(title)s.%(ext)s
 ```
 
+Note that the options in the configuration file are just the same options (a.k.a. switches) used in regular command-line calls, thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`.
+
 You can use `--ignore-config` if you want to disable the configuration file for a particular youtube-dl run.
 
 ### Authentication with `.netrc` file
@@ -440,7 +447,11 @@ On Windows you may also need to setup the `%HOME%` environment variable manually
 
 # OUTPUT TEMPLATE
 
-The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are:
+The `-o` option allows users to indicate a template for the output file names.
+
+**tl;dr:** [navigate me to examples](#output-template-examples).
+
+The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a lowercase S. Allowed names are:
 
  - `id`: Video identifier
  - `title`: Video title
@@ -449,6 +460,7 @@ The `-o` option allows users to indicate a template for the output file names. T
  - `alt_title`: A secondary title of the video
  - `display_id`: An alternative identifier for the video
  - `uploader`: Full name of the video uploader
+ - `license`: Name of the license the video is licensed under
  - `creator`: The main artist who created the video
  - `release_date`: The date (YYYYMMDD) when the video was released
  - `timestamp`: UNIX timestamp of the moment the video became available
@@ -513,7 +525,9 @@ The current default template is `%(title)s-%(id)s.%(ext)s`.
 
 In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
 
-Examples (note on Windows you may need to use double quotes instead of single):
+#### Output template examples
+
+Note that on Windows you may need to use double quotes instead of single ones.
 
 ```bash
 $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc
@@ -525,6 +539,9 @@ youtube-dl_test_video_.mp4          # A simple file name
 # Download YouTube playlist videos in separate directory indexed by video order in a playlist
 $ youtube-dl -o '%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re
 
+# Download all playlists of YouTube channel/user keeping each playlist in separate directory:
+$ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)s' https://www.youtube.com/user/TheLinuxFoundation/playlists
+
 # Download Udemy course keeping each chapter in separate directory under MyVideos directory in your home
 $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/
 
@@ -543,6 +560,8 @@ But sometimes you may want to download in a different format, for example when y
 
 The general syntax for format selection is `--format FORMAT` or shorter `-f FORMAT` where `FORMAT` is a *selector expression*, i.e. an expression that describes format or formats you would like to download.
 
+**tl;dr:** [navigate me to examples](#format-selection-examples).
+
 The simplest case is requesting a specific format, for example with `-f 22` you can download the format with format code equal to 22. You can get the list of available format codes for particular video using `--list-formats` or `-F`. Note that these format codes are extractor specific. 
 
 You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download best quality format of particular file extension served as a single file, e.g. `-f webm` will download best quality format with `webm` extension served as a single file.
@@ -588,11 +607,14 @@ You can merge the video and audio of two formats into a single file using `-f <v
 
 Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`.
 
-Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed.
+Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see [#5447](https://github.com/rg3/youtube-dl/issues/5447), [#5456](https://github.com/rg3/youtube-dl/issues/5456)). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading the best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some DASH formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed.
 
 If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download the best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl.
 
-Examples (note on Windows you may need to use double quotes instead of single):
+#### Format selection examples
+
+Note that on Windows you may need to use double quotes instead of single ones.
+
 ```bash
 # Download best mp4 format available or any other best if no mp4 available
 $ youtube-dl -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best'
@@ -733,7 +755,7 @@ means you're using an outdated version of Python. Please update to Python 2.6 or
 
 ### What is this binary file? Where has the code gone?
 
-Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`.
+Since June 2012 ([#342](https://github.com/rg3/youtube-dl/issues/342)) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`.
 
 ### The exe throws a *Runtime error from Visual C++*
 
@@ -816,7 +838,9 @@ If you want to create a build of youtube-dl yourself, you'll need
 
 ### Adding support for a new site
 
-If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites, thus pull requests adding support for them **will be rejected**.
+
+After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
 2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
@@ -864,16 +888,17 @@ If you want to add support for a new site, you can follow this quick list (assum
     ```
 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want.
-8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want.
+8. Keep in mind that the only mandatory fields in the info dict for a successful extraction are `id`, `title` and either `url` or `formats`, i.e. these are the critical data without which extraction makes no sense. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from the aforementioned mandatory ones should be treated **as optional**, and extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** so as not to break the extraction of the mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into the resulting info dict as `description`, you should be prepared for this key to be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`/`_html_search_regex` (a minimal sketch appears after the corresponding list in the CONTRIBUTING.md section above).
+9. Check the code with [flake8](https://pypi.python.org/pypi/flake8).
+10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/__init__.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
 
 In any case, thank you very much for your contributions!
 
index b384a31654d319edfb53a2131ca6c67d46acd0a4..0682e4af041c8d01ca8507679782a403a68d9564 100644 (file)
@@ -30,6 +30,7 @@
  - **AlJazeera**
  - **Allocine**
  - **AlphaPorno**
+ - **AnimeOnDemand**
  - **anitube.se**
  - **AnySex**
  - **Aparat**
@@ -49,6 +50,7 @@
  - **arte.tv:ddc**
  - **arte.tv:embed**
  - **arte.tv:future**
+ - **arte.tv:magazine**
  - **AtresPlayer**
  - **ATTTechChannel**
  - **AudiMedia**
@@ -75,6 +77,7 @@
  - **BleacherReportCMS**
  - **blinkx**
  - **Bloomberg**
+ - **BokeCC**
  - **Bpb**: Bundeszentrale für politische Bildung
  - **BR**: Bayerischer Rundfunk Mediathek
  - **Break**
  - **faz.net**
  - **fc2**
  - **Fczenit**
+ - **features.aol.com**
  - **fernsehkritik.tv**
  - **Firstpost**
  - **FiveTV**
  - **kontrtube**: KontrTube.ru - Труба зовёт
  - **KrasView**: Красвью
  - **Ku6**
+ - **KUSI**
  - **kuwo:album**: 酷我音乐 - 专辑
  - **kuwo:category**: 酷我音乐 - 分类
  - **kuwo:chart**: 酷我音乐 - 排行榜
  - **kuwo:song**: 酷我音乐
  - **la7.tv**
  - **Laola1Tv**
+ - **Le**: 乐视网
  - **Lecture2Go**
  - **Lemonde**
- - **Letv**: 乐视网
+ - **LePlaylist**
  - **LetvCloud**: 乐视云
- - **LetvPlaylist**
- - **LetvTv**
  - **Libsyn**
  - **life:embed**
  - **lifenews**: LIFE | NEWS
  - **MySpace:album**
  - **MySpass**
  - **Myvi**
- - **myvideo**
+ - **myvideo** (Currently broken)
  - **MyVidster**
  - **n-tv.de**
  - **NationalGeographic**
  - **NowTV** (Currently broken)
  - **NowTVList**
  - **nowvideo**: NowVideo
+ - **Noz**
  - **npo**: npo.nl and ntr.nl
  - **npo.nl:live**
  - **npo.nl:radio**
  - **Npr**
  - **NRK**
  - **NRKPlaylist**
+ - **NRKSkole**: NRK Skole
  - **NRKTV**: NRK TV and NRK Radio
  - **ntv.ru**
  - **Nuvid**
  - **PornHd**
  - **PornHub**
  - **PornHubPlaylist**
+ - **PornHubUserVideos**
  - **Pornotube**
  - **PornoVoisines**
  - **PornoXO**
  - **screen.yahoo:search**: Yahoo screen search
  - **Screencast**
  - **ScreencastOMatic**
+ - **ScreenJunkies**
  - **ScreenwaveMedia**
  - **SenateISVP**
  - **ServingSys**
  - **southpark.de**
  - **southpark.nl**
  - **southparkstudios.dk**
- - **Space**
  - **SpankBang**
  - **Spankwire**
  - **Spiegel**
  - **TMZ**
  - **TMZArticle**
  - **TNAFlix**
+ - **TNAFlixNetworkEmbed**
  - **toggle**
  - **tou.tv**
  - **Toypics**: Toypics user profile
  - **twitch:video**
  - **twitch:vod**
  - **twitter**
+ - **twitter:amplify**
  - **twitter:card**
  - **Ubu**
  - **udemy**
  - **Urort**: NRK P3 Urørt
  - **ustream**
  - **ustream:channel**
+ - **Ustudio**
  - **Varzesh3**
  - **Vbox7**
  - **VeeHD**
  - **video.mit.edu**
  - **VideoDetective**
  - **videofy.me**
- - **VideoMega** (Currently broken)
+ - **VideoMega**
  - **videomore**
  - **videomore:season**
  - **videomore:video**
index bdd7acca4d91c490f29c21aeac7cc9ba01c86952..f2d87821290095c1f9526f50db5d80ab31969d56 100644 (file)
@@ -11,8 +11,11 @@ import sys
 
 import youtube_dl.extractor
 from youtube_dl import YoutubeDL
-from youtube_dl.utils import (
+from youtube_dl.compat import (
+    compat_os_name,
     compat_str,
+)
+from youtube_dl.utils import (
     preferredencoding,
     write_string,
 )
@@ -42,7 +45,7 @@ def report_warning(message):
     Print the message to stderr, it will be prefixed with 'WARNING:'
     If stderr is a tty file the 'WARNING:' will be colored
     '''
-    if sys.stderr.isatty() and os.name != 'nt':
+    if sys.stderr.isatty() and compat_os_name != 'nt':
         _msg_header = '\033[0;33mWARNING:\033[0m'
     else:
         _msg_header = 'WARNING:'
index 02caf5908233b2971c66dd0db17820a847c3582b..efbee3b711b046f62fbb486375486a9e558e5035 100644 (file)
@@ -234,7 +234,7 @@ class TestFormatSelection(unittest.TestCase):
 
     def test_youtube_format_selection(self):
         order = [
-            '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13',
+            '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13',
             # Apple HTTP Live Streaming
             '96', '95', '94', '93', '92', '132', '151',
             # 3D
@@ -502,6 +502,9 @@ class TestYoutubeDL(unittest.TestCase):
         assertRegexpMatches(self, ydl._format_note({
             'vbr': 10,
         }), '^\s*10k$')
+        assertRegexpMatches(self, ydl._format_note({
+            'fps': 30,
+        }), '^30fps$')
 
     def test_postprocessors(self):
         filename = 'post-processor-testfile.mp4'
index f2e305b6fed3ce2f0574a7c20e89ffb977934f28..fc59b1aed6ddc2db10598a1a4b954a128e3d3133 100644 (file)
@@ -52,7 +52,12 @@ class TestHTTP(unittest.TestCase):
             ('localhost', 0), HTTPTestRequestHandler)
         self.httpd.socket = ssl.wrap_socket(
             self.httpd.socket, certfile=certfn, server_side=True)
-        self.port = self.httpd.socket.getsockname()[1]
+        if os.name == 'java':
+            # In Jython SSLSocket is not a subclass of socket.socket
+            sock = self.httpd.socket.sock
+        else:
+            sock = self.httpd.socket
+        self.port = sock.getsockname()[1]
         self.server_thread = threading.Thread(target=self.httpd.serve_forever)
         self.server_thread.daemon = True
         self.server_thread.start()
diff --git a/test/test_iqiyi_sdk_interpreter.py b/test/test_iqiyi_sdk_interpreter.py
new file mode 100644 (file)
index 0000000..9d95cb6
--- /dev/null
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL
+from youtube_dl.extractor import IqiyiIE
+
+
+class IqiyiIEWithCredentials(IqiyiIE):
+    def _get_login_info(self):
+        return 'foo', 'bar'
+
+
+class WarningLogger(object):
+    def __init__(self):
+        self.messages = []
+
+    def warning(self, msg):
+        self.messages.append(msg)
+
+    def debug(self, msg):
+        pass
+
+    def error(self, msg):
+        pass
+
+
+class TestIqiyiSDKInterpreter(unittest.TestCase):
+    def test_iqiyi_sdk_interpreter(self):
+        '''
+        Test the functionality of IqiyiSDKInterpreter by trying to log in
+
+        If `sign` is incorrect, the /validate call returns an HTTP 556 error
+        '''
+        logger = WarningLogger()
+        ie = IqiyiIEWithCredentials(FakeYDL({'logger': logger}))
+        ie._login()
+        self.assertTrue('unable to log in:' in logger.messages[0])
+
+if __name__ == '__main__':
+    unittest.main()
index a1e416dd5fdd1e385a5a37e87b65e3d78e334b4a..bc28ceb344f0388ed11028af3171d2007777c366 100644 (file)
@@ -18,6 +18,7 @@ import xml.etree.ElementTree
 from youtube_dl.utils import (
     age_restricted,
     args_to_str,
+    encode_base_n,
     clean_html,
     DateRange,
     detect_exe_version,
@@ -35,10 +36,12 @@ from youtube_dl.utils import (
     is_html,
     js_to_json,
     limit_length,
+    ohdave_rsa_encrypt,
     OnDemandPagedList,
     orderedSet,
     parse_duration,
     parse_filesize,
+    parse_count,
     parse_iso8601,
     read_batch_urls,
     sanitize_filename,
@@ -59,6 +62,7 @@ from youtube_dl.utils import (
     lowercase_escape,
     url_basename,
     urlencode_postdata,
+    update_url_query,
     version_tuple,
     xpath_with_ns,
     xpath_element,
@@ -74,6 +78,8 @@ from youtube_dl.utils import (
 )
 from youtube_dl.compat import (
     compat_etree_fromstring,
+    compat_urlparse,
+    compat_parse_qs,
 )
 
 
@@ -248,6 +254,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(
             unified_strdate('2/2/2015 6:47:40 PM', day_first=False),
             '20150202')
+        self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214')
         self.assertEqual(unified_strdate('25-09-2014'), '20140925')
         self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)
 
@@ -451,6 +458,40 @@ class TestUtil(unittest.TestCase):
         data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
         self.assertTrue(isinstance(data, bytes))
 
+    def test_update_url_query(self):
+        def query_dict(url):
+            return compat_parse_qs(compat_urlparse.urlparse(url).query)
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})),
+            query_dict('http://example.com/path?quality=HD&format=mp4'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})),
+            query_dict('http://example.com/path?system=LINUX&system=WINDOWS'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'fields': 'id,formats,subtitles'})),
+            query_dict('http://example.com/path?fields=id,formats,subtitles'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})),
+            query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path?manifest=f4m', {'manifest': []})),
+            query_dict('http://example.com/path'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})),
+            query_dict('http://example.com/path?system=LINUX'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'fields': b'id,formats,subtitles'})),
+            query_dict('http://example.com/path?fields=id,formats,subtitles'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'width': 1080, 'height': 720})),
+            query_dict('http://example.com/path?width=1080&height=720'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'bitrate': 5020.43})),
+            query_dict('http://example.com/path?bitrate=5020.43'))
+        self.assertEqual(query_dict(update_url_query(
+            'http://example.com/path', {'test': '第二行тест'})),
+            query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
+
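The semantics these tests pin down can be summarized with a rough, hypothetical reimplementation (the real helper lives in `youtube_dl/utils.py` and differs in details; plain Python 3 stdlib is used here for brevity):

```python
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def update_url_query_sketch(url, query):
    """Merge `query` into the URL's query string, replacing any
    existing parameters of the same name (illustrative sketch)."""
    parsed = urlparse(url)
    qs = parse_qs(parsed.query)
    for key, value in query.items():
        if isinstance(value, (list, tuple)):
            values = [str(v) for v in value]
        elif isinstance(value, bytes):
            values = [value.decode('utf-8')]
        else:
            values = [str(value)]
        if values:
            qs[key] = values
        else:
            qs.pop(key, None)  # an empty sequence drops the parameter
    return urlunparse(parsed._replace(query=urlencode(qs, doseq=True)))
```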
     def test_dict_get(self):
         FALSE_VALUES = {
             'none': None,
@@ -613,6 +654,15 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
         self.assertEqual(parse_filesize('1,24 KB'), 1240)
 
+    def test_parse_count(self):
+        self.assertEqual(parse_count(None), None)
+        self.assertEqual(parse_count(''), None)
+        self.assertEqual(parse_count('0'), 0)
+        self.assertEqual(parse_count('1000'), 1000)
+        self.assertEqual(parse_count('1.000'), 1000)
+        self.assertEqual(parse_count('1.1k'), 1100)
+        self.assertEqual(parse_count('1.1kk'), 1100000)
+
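For reference, the behaviour these `parse_count` tests encode can be sketched as follows (hypothetical code, not the actual `youtube_dl.utils` implementation):

```python
def parse_count_sketch(s):
    # None or empty input yields None; a trailing 'k' multiplies by
    # 1000 and may repeat ('kk' means million); in a plain integer
    # '.' acts as a thousands separator ('1.000' -> 1000).
    if not s:
        return None
    s = s.strip()
    multiplier = 1
    while s.lower().endswith('k'):
        multiplier *= 1000
        s = s[:-1]
    if multiplier == 1:
        return int(s.replace('.', '').replace(',', ''))
    return int(float(s) * multiplier)
```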
     def test_version_tuple(self):
         self.assertEqual(version_tuple('1'), (1,))
         self.assertEqual(version_tuple('10.23.344'), (10, 23, 344))
@@ -792,6 +842,24 @@ The first line
                 {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
             ['--check-certificate=true'])
 
+    def test_ohdave_rsa_encrypt(self):
+        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
+        e = 65537
+
+        self.assertEqual(
+            ohdave_rsa_encrypt(b'aa111222', e, N),
+            '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881')
+
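`ohdave_rsa_encrypt` is the raw RSA primitive (`m ** e mod N`, hex-encoded) used by the iQiyi login flow. Conceptually it looks like the sketch below; note this is purely illustrative and the actual helper's byte ordering may differ:

```python
import binascii

def rsa_encrypt_sketch(data, exponent, modulus):
    # Interpret the plaintext bytes as one big integer, apply modular
    # exponentiation, and return the result as a lowercase hex string.
    payload = int(binascii.hexlify(data), 16)
    return '%x' % pow(payload, exponent, modulus)
```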
+    def test_encode_base_n(self):
+        self.assertEqual(encode_base_n(0, 30), '0')
+        self.assertEqual(encode_base_n(80, 30), '2k')
+
+        custom_table = '9876543210ZYXWVUTSRQPONMLKJIHGFEDCBA'
+        self.assertEqual(encode_base_n(0, 30, custom_table), '9')
+        self.assertEqual(encode_base_n(80, 30, custom_table), '7P')
+
+        self.assertRaises(ValueError, encode_base_n, 0, 70)
+        self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table)
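These tests fully determine the behaviour of `encode_base_n`; a minimal sketch consistent with them (the default 62-symbol digit table 0-9/a-z/A-Z is inferred from the expected outputs):

```python
def encode_base_n_sketch(num, n, table=None):
    DEFAULT_TABLE = ('0123456789'
                     'abcdefghijklmnopqrstuvwxyz'
                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    table = table or DEFAULT_TABLE
    if n > len(table):
        # e.g. base 70 with the 62-symbol default table
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
    if num == 0:
        return table[0]
    ret = ''
    while num:
        ret = table[num % n] + ret
        num //= n
    return ret
```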
 
 if __name__ == '__main__':
     unittest.main()
index f4324039c72ec656f67bf536ee7856aae464cc01..8c651cd52375e1dcf986307b57f447fce4025543 100755 (executable)
@@ -24,9 +24,6 @@ import time
 import tokenize
 import traceback
 
-if os.name == 'nt':
-    import ctypes
-
 from .compat import (
     compat_basestring,
     compat_cookiejar,
@@ -34,6 +31,7 @@ from .compat import (
     compat_get_terminal_size,
     compat_http_client,
     compat_kwargs,
+    compat_os_name,
     compat_str,
     compat_tokenize_tokenize,
     compat_urllib_error,
@@ -87,6 +85,7 @@ from .extractor import get_info_extractor, gen_extractors
 from .downloader import get_suitable_downloader
 from .downloader.rtmp import rtmpdump_version
 from .postprocessor import (
+    FFmpegFixupM3u8PP,
     FFmpegFixupM4aPP,
     FFmpegFixupStretchedPP,
     FFmpegMergerPP,
@@ -95,6 +94,9 @@ from .postprocessor import (
 )
 from .version import __version__
 
+if compat_os_name == 'nt':
+    import ctypes
+
 
 class YoutubeDL(object):
     """YoutubeDL class.
@@ -450,7 +452,7 @@ class YoutubeDL(object):
     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
             return
-        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
             # c_wchar_p() might not be necessary if `message` is
             # already of type unicode()
             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
@@ -521,7 +523,7 @@ class YoutubeDL(object):
         else:
             if self.params.get('no_warnings'):
                 return
-            if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
                 _msg_header = '\033[0;33mWARNING:\033[0m'
             else:
                 _msg_header = 'WARNING:'
@@ -533,7 +535,7 @@ class YoutubeDL(object):
         Do the same as trouble, but prefixes the message with 'ERROR:', colored
         in red if stderr is a tty file.
         '''
-        if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
             _msg_header = '\033[0;31mERROR:\033[0m'
         else:
             _msg_header = 'ERROR:'
@@ -566,7 +568,7 @@ class YoutubeDL(object):
                 elif template_dict.get('height'):
                     template_dict['resolution'] = '%sp' % template_dict['height']
                 elif template_dict.get('width'):
-                    template_dict['resolution'] = '?x%d' % template_dict['width']
+                    template_dict['resolution'] = '%dx?' % template_dict['width']
 
             sanitize = lambda k, v: sanitize_filename(
                 compat_str(v),
@@ -1232,6 +1234,10 @@ class YoutubeDL(object):
                 if t.get('id') is None:
                     t['id'] = '%d' % i
 
+        if self.params.get('list_thumbnails'):
+            self.list_thumbnails(info_dict)
+            return
+
         if thumbnails and 'thumbnail' not in info_dict:
             info_dict['thumbnail'] = thumbnails[-1]['url']
 
@@ -1333,9 +1339,6 @@ class YoutubeDL(object):
         if self.params.get('listformats'):
             self.list_formats(info_dict)
             return
-        if self.params.get('list_thumbnails'):
-            self.list_thumbnails(info_dict)
-            return
 
         req_format = self.params.get('format')
         if req_format is None:
@@ -1631,12 +1634,14 @@ class YoutubeDL(object):
                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                 return
 
-            if success:
+            if success and filename != '-':
                 # Fixup content
                 fixup_policy = self.params.get('fixup')
                 if fixup_policy is None:
                     fixup_policy = 'detect_or_warn'
 
+                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
+
                 stretched_ratio = info_dict.get('stretched_ratio')
                 if stretched_ratio is not None and stretched_ratio != 1:
                     if fixup_policy == 'warn':
@@ -1649,15 +1654,18 @@ class YoutubeDL(object):
                             info_dict['__postprocessors'].append(stretched_pp)
                         else:
                             self.report_warning(
-                                '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
-                                    info_dict['id'], stretched_ratio))
+                                '%s: Non-uniform pixel ratio (%s). %s'
+                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
                     else:
                         assert fixup_policy in ('ignore', 'never')
 
-                if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
+                if (info_dict.get('requested_formats') is None and
+                        info_dict.get('container') == 'm4a_dash'):
                     if fixup_policy == 'warn':
-                        self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
-                            info_dict['id']))
+                        self.report_warning(
+                            '%s: writing DASH m4a. '
+                            'Only some players support this container.'
+                            % info_dict['id'])
                     elif fixup_policy == 'detect_or_warn':
                         fixup_pp = FFmpegFixupM4aPP(self)
                         if fixup_pp.available:
@@ -1665,8 +1673,27 @@ class YoutubeDL(object):
                             info_dict['__postprocessors'].append(fixup_pp)
                         else:
                             self.report_warning(
-                                '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
-                                    info_dict['id']))
+                                '%s: writing DASH m4a. '
+                                'Only some players support this container. %s'
+                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
+                    else:
+                        assert fixup_policy in ('ignore', 'never')
+
+                if (info_dict.get('protocol') == 'm3u8_native' or
+                        info_dict.get('protocol') == 'm3u8' and
+                        self.params.get('hls_prefer_native')):
+                    if fixup_policy == 'warn':
+                        self.report_warning('%s: malformed AAC bitstream.' % (
+                            info_dict['id']))
+                    elif fixup_policy == 'detect_or_warn':
+                        fixup_pp = FFmpegFixupM3u8PP(self)
+                        if fixup_pp.available:
+                            info_dict.setdefault('__postprocessors', [])
+                            info_dict['__postprocessors'].append(fixup_pp)
+                        else:
+                            self.report_warning(
+                                '%s: malformed AAC bitstream. %s'
+                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                     else:
                         assert fixup_policy in ('ignore', 'never')
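Note that in the new `m3u8` condition `and` binds tighter than `or`, so it is equivalent to the following (parenthesized for clarity):

```python
if (info_dict.get('protocol') == 'm3u8_native' or
        (info_dict.get('protocol') == 'm3u8' and
         self.params.get('hls_prefer_native'))):
    pass  # apply the AAC bitstream fixup in either case
```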
 
@@ -1830,7 +1857,9 @@ class YoutubeDL(object):
         if fdict.get('vbr') is not None:
             res += '%4dk' % fdict['vbr']
         if fdict.get('fps') is not None:
-            res += ', %sfps' % fdict['fps']
+            if res:
+                res += ', '
+            res += '%sfps' % fdict['fps']
         if fdict.get('acodec') is not None:
             if res:
                 res += ', '
@@ -1873,13 +1902,8 @@ class YoutubeDL(object):
     def list_thumbnails(self, info_dict):
         thumbnails = info_dict.get('thumbnails')
         if not thumbnails:
-            tn_url = info_dict.get('thumbnail')
-            if tn_url:
-                thumbnails = [{'id': '0', 'url': tn_url}]
-            else:
-                self.to_screen(
-                    '[info] No thumbnails present for %s' % info_dict['id'])
-                return
+            self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
+            return
 
         self.to_screen(
             '[info] Thumbnails for %s:' % info_dict['id'])
index f5f06424146f88fa045a26db569cbe7fa42da586..79b3898409bb4cd2f703a15b26f7225408c279d6 100644 (file)
@@ -355,6 +355,7 @@ def _real_main(argv=None):
         'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
         'encoding': opts.encoding,
         'extract_flat': opts.extract_flat,
+        'mark_watched': opts.mark_watched,
         'merge_output_format': opts.merge_output_format,
         'postprocessors': postprocessors,
         'fixup': opts.fixup,
index b497da6964944b5d47e0805f9e872357b6c11c74..2771fb5faa371ccf01d1d4f9449b87c7a380c175 100644 (file)
@@ -326,6 +326,9 @@ def compat_ord(c):
         return ord(c)
 
 
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
 if sys.version_info >= (3, 0):
     compat_getenv = os.getenv
     compat_expanduser = os.path.expanduser
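The new `compat_os_name` exists because under Jython `os.name` is `'java'` and the underlying platform name (`'nt'` or `'posix'`) is only exposed as `os._name`. All platform checks in the codebase are then routed through it, e.g.:

```python
from youtube_dl.compat import compat_os_name

# True on Windows under both CPython (os.name == 'nt') and
# Jython (os.name == 'java', os._name == 'nt'):
if compat_os_name == 'nt':
    pass  # Windows-specific handling goes here
```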
@@ -346,7 +349,7 @@ else:
     # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
     # for different platforms with correct environment variables decoding.
 
-    if os.name == 'posix':
+    if compat_os_name == 'posix':
         def compat_expanduser(path):
             """Expand ~ and ~user constructions.  If user or $HOME is unknown,
             do nothing."""
@@ -370,7 +373,7 @@ else:
                 userhome = pwent.pw_dir
             userhome = userhome.rstrip('/')
             return (userhome + path[i:]) or '/'
-    elif os.name == 'nt' or os.name == 'ce':
+    elif compat_os_name == 'nt' or compat_os_name == 'ce':
         def compat_expanduser(path):
             """Expand ~ and ~user constructs.
 
@@ -556,6 +559,7 @@ __all__ = [
     'compat_itertools_count',
     'compat_kwargs',
     'compat_ord',
+    'compat_os_name',
     'compat_parse_qs',
     'compat_print',
     'compat_shlex_split',
index 2d51540518f7ee40c7d002d632007d5aa9542697..f39db58f6c13f623a00b37ef0565dad70f18a305 100644 (file)
@@ -5,6 +5,7 @@ import re
 import sys
 import time
 
+from ..compat import compat_os_name
 from ..utils import (
     encodeFilename,
     error_to_compat_str,
@@ -219,7 +220,7 @@ class FileDownloader(object):
         if self.params.get('progress_with_newline', False):
             self.to_screen(fullmsg)
         else:
-            if os.name == 'nt':
+            if compat_os_name == 'nt':
                 prev_len = getattr(self, '_report_progress_prev_line_length',
                                    0)
                 if prev_len > len(fullmsg):
index 5bc99492bc7b90abdc5c133b3f6c573d54fe9ed3..a5bae96699e0b0f81fd11deab4900fe5ed8b820d 100644 (file)
@@ -99,7 +99,8 @@ class FragmentFD(FileDownloader):
                     state['eta'] = self.calc_eta(
                         start, time_now, estimated_size,
                         state['downloaded_bytes'])
-                state['speed'] = s.get('speed')
+                state['speed'] = s.get('speed') or ctx.get('speed')
+                ctx['speed'] = state['speed']
                 ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
             self._hook_progress(state)
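The two changed lines cache the last measured speed in the shared download context, so that between fragments (when the HTTP downloader momentarily reports no speed) progress hooks still receive a sensible value. Annotated, the new lines read:

```python
# s.get('speed') is None between fragments; fall back to the cached value
state['speed'] = s.get('speed') or ctx.get('speed')
# remember the latest measurement for the next inter-fragment gap
ctx['speed'] = state['speed']
```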
 
index 1edbfbd2830b852f9283ab1c4bfe3085a4f76ab2..c5b80f4aadc423765b6bb3a4ef63289849642655 100644 (file)
@@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE
 from .animeondemand import AnimeOnDemandIE
 from .anitube import AnitubeIE
 from .anysex import AnySexIE
-from .aol import AolIE
+from .aol import (
+    AolIE,
+    AolFeaturesIE,
+)
 from .allocine import AllocineIE
 from .aparat import AparatIE
 from .appleconnect import AppleConnectIE
@@ -51,6 +54,7 @@ from .arte import (
 from .atresplayer import AtresPlayerIE
 from .atttechchannel import ATTTechChannelIE
 from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
 from .audiomack import AudiomackIE, AudiomackAlbumIE
 from .azubu import AzubuIE, AzubuLiveIE
 from .baidu import BaiduVideoIE
@@ -74,6 +78,7 @@ from .bleacherreport import (
 )
 from .blinkx import BlinkxIE
 from .bloomberg import BloombergIE
+from .bokecc import BokeCCIE
 from .bpb import BpbIE
 from .br import BRIE
 from .breakcom import BreakIE
@@ -184,6 +189,10 @@ from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
 from .dropbox import DropboxIE
+from .dw import (
+    DWIE,
+    DWArticleIE,
+)
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
@@ -208,10 +217,7 @@ from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .expotv import ExpoTVIE
 from .extremetube import ExtremeTubeIE
-from .facebook import (
-    FacebookIE,
-    FacebookPostIE,
-)
+from .facebook import FacebookIE
 from .faz import FazIE
 from .fc2 import FC2IE
 from .fczenit import FczenitIE
@@ -339,6 +345,7 @@ from .konserthusetplay import KonserthusetPlayIE
 from .kontrtube import KontrTubeIE
 from .krasview import KrasViewIE
 from .ku6 import Ku6IE
+from .kusi import KUSIIE
 from .kuwo import (
     KuwoIE,
     KuwoAlbumIE,
@@ -351,10 +358,9 @@ from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
 from .lecture2go import Lecture2GoIE
 from .lemonde import LemondeIE
-from .letv import (
-    LetvIE,
-    LetvTvIE,
-    LetvPlaylistIE,
+from .leeco import (
+    LeIE,
+    LePlaylistIE,
     LetvCloudIE,
 )
 from .libsyn import LibsynIE
@@ -383,6 +389,7 @@ from .lynda import (
 from .m6 import M6IE
 from .macgamestore import MacGameStoreIE
 from .mailru import MailRuIE
+from .makerschannel import MakersChannelIE
 from .makertv import MakerTVIE
 from .malemotion import MalemotionIE
 from .matchtv import MatchTVIE
@@ -392,6 +399,7 @@ from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
 from .miomio import MioMioIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
@@ -505,6 +513,7 @@ from .npr import NprIE
 from .nrk import (
     NRKIE,
     NRKPlaylistIE,
+    NRKSkoleIE,
     NRKTVIE,
 )
 from .ntvde import NTVDeIE
@@ -669,7 +678,6 @@ from .southpark import (
     SouthParkEsIE,
     SouthParkNlIE
 )
-from .space import SpaceIE
 from .spankbang import SpankBangIE
 from .spankwire import SpankwireIE
 from .spiegel import SpiegelIE, SpiegelArticleIE
@@ -737,6 +745,7 @@ from .tmz import (
     TMZArticleIE,
 )
 from .tnaflix import (
+    TNAFlixNetworkEmbedIE,
     TNAFlixIE,
     EMPFlixIE,
     MovieFapIE,
@@ -798,7 +807,11 @@ from .twitch import (
     TwitchBookmarksIE,
     TwitchStreamIE,
 )
-from .twitter import TwitterCardIE, TwitterIE
+from .twitter import (
+    TwitterCardIE,
+    TwitterIE,
+    TwitterAmplifyIE,
+)
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
@@ -809,6 +822,7 @@ from .digiteka import DigitekaIE
 from .unistra import UnistraIE
 from .urort import UrortIE
 from .ustream import UstreamIE, UstreamChannelIE
+from .ustudio import UstudioIE
 from .varzesh3 import Varzesh3IE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
@@ -822,7 +836,10 @@ from .vgtv import (
     VGTVIE,
 )
 from .vh1 import VH1IE
-from .vice import ViceIE
+from .vice import (
+    ViceIE,
+    ViceShowIE,
+)
 from .viddler import ViddlerIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
@@ -849,6 +866,7 @@ from .vimeo import (
     VimeoChannelIE,
     VimeoGroupsIE,
     VimeoLikesIE,
+    VimeoOndemandIE,
     VimeoReviewIE,
     VimeoUserIE,
     VimeoWatchLaterIE,
index b51eafc45928f8e6ff4ce571763593f71b715583..b761b2cc4c5d3d4b70766ed56ff5c3529dd39e6b 100644 (file)
@@ -1,24 +1,11 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
 class AolIE(InfoExtractor):
     IE_NAME = 'on.aol.com'
-    _VALID_URL = r'''(?x)
-        (?:
-            aol-video:|
-            http://on\.aol\.com/
-            (?:
-                video/.*-|
-                playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
-            )
-        )
-        (?P<id>[0-9]+)
-        (?:$|\?)
-    '''
+    _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)'
 
     _TESTS = [{
         'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
@@ -29,42 +16,31 @@ class AolIE(InfoExtractor):
             'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
         },
         'add_ie': ['FiveMin'],
-    }, {
-        'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316',
-        'info_dict': {
-            'id': '152147',
-            'title': 'Brace Yourself - Today\'s Weirdest News',
-        },
-        'playlist_mincount': 10,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        playlist_id = mobj.group('playlist_id')
-        if not playlist_id or self._downloader.params.get('noplaylist'):
-            return self.url_result('5min:%s' % video_id)
+        video_id = self._match_id(url)
+        return self.url_result('5min:%s' % video_id)
 
-        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
-        webpage = self._download_webpage(url, playlist_id)
-        title = self._html_search_regex(
-            r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
-        playlist_html = self._search_regex(
-            r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
-            'playlist HTML')
-        entries = [{
-            '_type': 'url',
-            'url': 'aol-video:%s' % m.group('id'),
-            'ie_key': 'Aol',
-        } for m in re.finditer(
-            r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
-            playlist_html)]
+class AolFeaturesIE(InfoExtractor):
+    IE_NAME = 'features.aol.com'
+    _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)'
 
-        return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'display_id': mobj.group('playlist_display_id'),
-            'title': title,
-            'entries': entries,
-        }
+    _TESTS = [{
+        'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts',
+        'md5': '7db483bb0c09c85e241f84a34238cc75',
+        'info_dict': {
+            'id': '519507715',
+            'ext': 'mp4',
+            'title': 'What To Watch - February 17, 2016',
+        },
+        'add_ie': ['FiveMin'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        return self.url_result(self._search_regex(
+            r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"',
+            webpage, '5min embed url'), 'FiveMin')
index 62ed0c9186aec37b45e417f5e2a315123232bbf0..be40f85b487057b4cb319dba102cec76519880a5 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class AppleTrailersIE(InfoExtractor):
     IE_NAME = 'appletrailers'
-    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
     _TESTS = [{
         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
         'info_dict': {
@@ -73,6 +73,9 @@ class AppleTrailersIE(InfoExtractor):
     }, {
         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
         'only_matching': True,
+    }, {
+        'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
+        'only_matching': True,
     }]
 
     _JSON_RE = r'iTunes.playURL\((.*?)\);'
index 793da2ee1d8b015a5573d7febea439bb4f5e465a..3e119e21b39ba2ab6bc504cf1d19a90008bfbd24 100644 (file)
@@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor):
 
 class ArteTVPlus7IE(InfoExtractor):
     IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
 
     @classmethod
     def _extract_url_info(cls, url):
@@ -110,17 +110,29 @@ class ArteTVPlus7IE(InfoExtractor):
             # en and es URLs produce react-based pages with different layout (e.g.
             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
             if not iframe_url:
-                embed_html = self._parse_json(
-                    self._search_regex(
-                        r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
-                        webpage, 'program'),
-                    video_id)['embed_html']
-                iframe_url = find_iframe_url(embed_html)
-            json_url = compat_parse_qs(
-                compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
-        return self._extract_from_json_url(json_url, video_id, lang)
-
-    def _extract_from_json_url(self, json_url, video_id, lang):
+                program = self._search_regex(
+                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+                    webpage, 'program', default=None)
+                if program:
+                    embed_html = self._parse_json(program, video_id)
+                    if embed_html:
+                        iframe_url = find_iframe_url(embed_html['embed_html'])
+            if iframe_url:
+                json_url = compat_parse_qs(
+                    compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+        if json_url:
+            title = self._search_regex(
+                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+                webpage, 'title', default=None, group='title')
+            return self._extract_from_json_url(json_url, video_id, lang, title=title)
+        # Different kind of embed URL (e.g.
+        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+        embed_url = self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
+            webpage, 'embed url', group='url')
+        return self.url_result(embed_url)
+
+    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
         info = self._download_json(json_url, video_id)
         player_info = info['videoJsonPlayer']
 
@@ -128,7 +140,7 @@ class ArteTVPlus7IE(InfoExtractor):
         if not upload_date_str:
             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
 
-        title = player_info['VTI'].strip()
+        title = (player_info.get('VTI') or title or player_info['VID']).strip()
         subtitle = player_info.get('VSU', '').strip()
         if subtitle:
             title += ' - %s' % subtitle
@@ -230,6 +242,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
             'id': '050940-028-A',
             'ext': 'mp4',
             'title': 'Les écrevisses aussi peuvent être anxieuses',
+            'upload_date': '20140902',
         },
     }, {
         'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable',
@@ -294,12 +307,25 @@ class ArteTVMagazineIE(ArteTVPlus7IE):
     _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 
     _TESTS = [{
+        # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..."
         'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium',
-        'md5': '66a093339c1278bb3719157ef07107b2',
+        'md5': '2a9369bcccf847d1c741e51416299f25',
         'info_dict': {
             'id': '065965-000-A',
             'ext': 'mp4',
             'title': 'Trepalium - Extrait Ep.01',
+            'upload_date': '20160121',
+        },
+    }, {
+        # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium"
+        'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium',
+        'md5': 'fedc64fc7a946110fe311634e79782ca',
+        'info_dict': {
+            'id': '054813-004_PLUS7-F',
+            'ext': 'mp4',
+            'title': 'Trepalium (4/6)',
+            'description': 'md5:10057003c34d54e95350be4f9b05cb40',
+            'upload_date': '20160218',
         },
     }, {
         'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis',
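
The ArteTVPlus7 change chains two lookups: find an embedded player iframe, then pull the real metadata endpoint out of its json_url query parameter. A minimal standalone sketch of that second hop; the embed URL below is invented for illustration and only its query-string shape matters:

    try:
        from urllib.parse import urlparse, parse_qs  # Python 3
    except ImportError:
        from urlparse import urlparse, parse_qs  # Python 2

    iframe_url = ('http://www.arte.tv/playerv2/embed.php?lang=fr&json_url='
                  'http%3A%2F%2Farte.tv%2Fpapi%2Ftvguide%2Fvideos%2Fstream%2F'
                  'player%2FF%2F057405-001-A%2FALL%2FALL.json')
    # parse_qs percent-decodes the value, yielding the plain JSON endpoint
    json_url = parse_qs(urlparse(iframe_url).query)['json_url'][0]
    print(json_url)
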
index 3b2effa15fe15a5527644349d785b452540c7568..aa6925623140f08090515fda2f42a7debd5545ac 100644 (file)
@@ -10,9 +10,9 @@ from ..utils import (
 
 
 class AudiMediaIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'
     _TEST = {
-        'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',
+        'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
         'md5': '79a8b71c46d49042609795ab59779b66',
         'info_dict': {
             'id': '1565',
@@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
 
-        raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload')
+        raw_payload = self._search_regex([
+            r'class="amtv-embed"[^>]+id="([^"]+)"',
+            r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"',
+        ], webpage, 'raw payload')
         _, stage_mode, video_id, lang = raw_payload.split('-')
 
         # TODO: handle s and e stage_mode (live streams and ended live streams)
@@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):
                 video_version_url = video_version.get('download_url') or video_version.get('stream_url')
                 if not video_version_url:
                     continue
-                formats.append({
+                f = {
                     'url': video_version_url,
                     'width': int_or_none(video_version.get('width')),
                     'height': int_or_none(video_version.get('height')),
                     'abr': int_or_none(video_version.get('audio_bitrate')),
                     'vbr': int_or_none(video_version.get('video_bitrate')),
-                })
+                }
+                bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
+                if bitrate:
+                    f.update({
+                        'format_id': 'http-%s' % bitrate,
+                    })
+                formats.append(f)
             self._sort_formats(formats)
 
             return {
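
The new format_id logic above derives a label from a bitrate token embedded in the download URL itself. A tiny sketch of that heuristic; the sample URL is made up and only its '720k' token matters:

    import re

    def http_format_id(video_url):
        # A '<digits>k' token in the URL, if present, becomes e.g. 'http-720'
        mobj = re.search(r'(\d+)k', video_url)
        return ('http-%s' % mobj.group(1)) if mobj else None

    print(http_format_id('https://example.com/video/clip_720k.mp4'))  # http-720
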
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644 (file)
index 0000000..2ec2d70
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+        'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+        'info_dict': {
+            'id': '4279833',
+            'ext': 'mp3',
+            'title': '3/09/2016 Czaban Hour 3',
+            'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans',
+            'duration': 2245.72,
+            'uploader': 'Steve Czaban',
+            'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        clip = None
+
+        clip_store = self._parse_json(
+            self._search_regex(
+                r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+                webpage, 'clip store', default='{}', group='json'),
+            video_id, fatal=False)
+        if clip_store:
+            clips = clip_store.get('clips')
+            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+                clip = clips[0]
+
+        def from_clip(field):
+            if clip:
+                return clip.get(field)

+
+        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+            'audio', webpage, 'audio url')
+        title = from_clip('title') or self._og_search_title(webpage)
+        description = from_clip('description') or self._og_search_description(webpage)
+
+        duration = float_or_none(from_clip('duration') or self._html_search_meta(
+            'weibo:audio:duration', webpage))
+
+        uploader = from_clip('author') or self._og_search_property(
+            'audio:artist', webpage, 'uploader', fatal=False)
+        uploader_url = from_clip('author_url') or self._html_search_meta(
+            'audioboo:channel', webpage, 'uploader url')
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+        }
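
The new AudioBoom extractor reads its primary metadata from a single data-new-clip-store JSON attribute and falls back to og:/meta tags field by field. A standalone sketch of the defensive parse; the sample markup is invented:

    import json
    import re

    def parse_clip_store(webpage, video_id):
        # If the attribute is missing or malformed, return {} so every
        # from_clip() lookup fails soft and the og:/meta fallbacks kick in
        mobj = re.search(
            r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
            webpage)
        try:
            return json.loads(mobj.group('json')) if mobj else {}
        except ValueError:
            return {}

    page = '<div data-new-clip-store=\'{"clips":[{"clipId":4279833,"title":"demo"}]}\'></div>'
    print(parse_clip_store(page, '4279833'))
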
index 9d0dfb9611687b15075d0e6fc7d57dfa0244c60a..e62b3860e99b106d08ef79cf593e180fe8c9496c 100644 (file)
@@ -10,7 +10,6 @@ from ..utils import (
     int_or_none,
     parse_duration,
     parse_iso8601,
-    remove_end,
     unescapeHTML,
 )
 from ..compat import (
@@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
         'info_dict': {
             'id': '3662a707-0af9-3149-963f-47bea720b460',
-            'title': 'BBC Blogs - Adam Curtis - BUGGER',
+            'title': 'BUGGER',
         },
         'playlist_count': 18,
     }, {
@@ -670,9 +669,17 @@ class BBCIE(BBCCoUkIE):
         'url': 'http://www.bbc.com/sport/0/football/34475836',
         'info_dict': {
             'id': '34475836',
-            'title': 'What Liverpool can expect from Klopp',
+            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
         },
         'playlist_count': 3,
+    }, {
+        # school report article with single video
+        'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+        'info_dict': {
+            'id': '35744779',
+            'title': 'School which breaks down barriers in Jerusalem',
+        },
+        'playlist_count': 1,
     }, {
         # single video with playlist URL from weather section
         'url': 'http://www.bbc.com/weather/features/33601775',
@@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):
 
         json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
         timestamp = json_ld_info.get('timestamp')
+
         playlist_title = json_ld_info.get('title')
-        playlist_description = json_ld_info.get('description')
+        if not playlist_title:
+            playlist_title = self._og_search_title(
+                webpage, default=None) or self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            if playlist_title:
+                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+        playlist_description = json_ld_info.get(
+            'description') or self._og_search_description(webpage, default=None)
 
         if not timestamp:
             timestamp = parse_iso8601(self._search_regex(
@@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):
                                 playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
 
         if entries:
-            playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
-            playlist_description = playlist_description or self._og_search_description(webpage, default=None)
             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 
         # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
@@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):
                 'subtitles': subtitles,
             }
 
-        playlist_title = self._html_search_regex(
-            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
-        playlist_description = self._og_search_description(webpage, default=None)
-
         def extract_all(pattern):
             return list(filter(None, map(
                 lambda s: self._parse_json(s, playlist_id, fatal=False),
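
The reworked title handling tries JSON-LD, then og:title, then the <title> tag, and finally strips any trailing ' - BBC …' brand suffix with one regex instead of the old ' - BBC News' special case. A quick standalone check of that pattern:

    import re

    def strip_bbc_suffix(title):
        # Drop a trailing ' - BBC <section>' suffix, whatever the section
        return re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', title).strip()

    print(strip_bbc_suffix('What Liverpool can expect from Klopp - BBC Sport'))
    # What Liverpool can expect from Klopp
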
index 38bda3af5a189cc7a2c8d65937a7d710edd0211f..7a8e1f60b82923b643918e43924fa64a5250cb83 100644 (file)
@@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):
         'add_ie': ['Ooyala'],
     }, {
         'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
-        'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50',
+        'md5': '6a5cd403418c7b01719248ca97fb0692',
         'info_dict': {
             'id': '2586817',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
             'timestamp': 1446839961,
             'uploader': 'Sean Fay',
@@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):
         'md5': '8c2c12e3af7805152675446c905d159b',
         'info_dict': {
             'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
             'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
         },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
new file mode 100644 (file)
index 0000000..122a1cb
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import ExtractorError
+
+
+class BokeCCBaseIE(InfoExtractor):
+    def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
+        player_params_str = self._html_search_regex(
+            r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
+            webpage, 'player params')
+
+        player_params = compat_parse_qs(player_params_str)
+
+        info_xml = self._download_xml(
+            'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
+                player_params['siteid'][0], player_params['vid'][0]), video_id)
+
+        formats = [{
+            'format_id': format_id,
+            'url': quality.find('./copy').attrib['playurl'],
+            'preference': int(quality.attrib['value']),
+        } for quality in info_xml.findall('./video/quality')]
+
+        self._sort_formats(formats)
+
+        return formats
+
+
+class BokeCCIE(BokeCCBaseIE):
+    IE_DESC = 'CC视频'
+    _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
+
+    _TESTS = [{
+        'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B',
+        'info_dict': {
+            'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30',
+            'ext': 'flv',
+            'title': 'BokeCC Video',
+        },
+    }]
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+        if not qs.get('vid') or not qs.get('uid'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        return {
+            'id': video_id,
+            'title': 'BokeCC Video',  # no title provided in the webpage
+            'formats': self._extract_bokecc_formats(webpage, video_id),
+        }
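
The playinfo endpoint answers with one <quality> node per rendition. A standalone sketch of the traversal in _extract_bokecc_formats; the sample XML shape is inferred from the code above and may differ in detail from real responses:

    from xml.etree import ElementTree

    def bokecc_formats(playinfo_xml):
        # One format per <quality> node, ranked by its numeric value attribute
        doc = ElementTree.fromstring(playinfo_xml)
        formats = [{
            'url': quality.find('./copy').attrib['playurl'],
            'preference': int(quality.attrib['value']),
        } for quality in doc.findall('./video/quality')]
        # Worst-first ordering, matching how _sort_formats ranks by preference
        return sorted(formats, key=lambda f: f['preference'])

    sample = ('<response><video>'
              '<quality value="10"><copy playurl="http://example.com/sd.flv"/></quality>'
              '<quality value="20"><copy playurl="http://example.com/hd.flv"/></quality>'
              '</video></response>')
    print([f['url'] for f in bokecc_formats(sample)])
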
index cb96c3876b7cbf02220d06ad86a44414d69c9fa8..cac8fdcba4a9967b9aa028c29bac1d539af458ca 100644 (file)
@@ -4,12 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import js_to_json
 
 
 class C56IE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
     IE_NAME = '56.com'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
         'md5': 'e59995ac63d0457783ea05f93f12a866',
         'info_dict': {
@@ -18,12 +19,29 @@ class C56IE(InfoExtractor):
             'title': '网事知多少 第32期:车怒',
             'duration': 283.813,
         },
-    }
+    }, {
+        'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+        'md5': '',
+        'info_dict': {
+            'id': '82247482',
+            'title': '爱的诅咒之杜鹃花开',
+        },
+        'playlist_count': 7,
+        'add_ie': ['Sohu'],
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
         text_id = mobj.group('textid')
 
+        webpage = self._download_webpage(url, text_id)
+        sohu_video_info_str = self._search_regex(
+            r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+        if sohu_video_info_str:
+            sohu_video_info = self._parse_json(
+                sohu_video_info_str, text_id, transform_source=js_to_json)
+            return self.url_result(sohu_video_info['url'], 'Sohu')
+
         page = self._download_json(
             'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
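
The sohuVideoInfo block above is a JavaScript object literal, not JSON, which is why it goes through js_to_json before _parse_json. A toy approximation of that transform (the real youtube_dl.utils.js_to_json handles many more cases) shows the idea:

    import json
    import re

    def js_to_json_lite(code):
        # Quote single-quoted strings and bare object keys so that a JS
        # object literal such as {vid: 1, url: '...'} parses as JSON
        code = re.sub(r"'([^']*)'", r'"\1"', code)
        return re.sub(r'([{,]\s*)([A-Za-z_]\w*)\s*:', r'\1"\2":', code)

    print(json.loads(js_to_json_lite("{vid: 82247482, url: 'http://tv.sohu.com/item'}")))
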
 
index 6d9cd8abd1545ff09d27c991a7dc7c5d2cc2a872..042c4f2f13757ab3f0c942932ada8e7a01160055 100644 (file)
@@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor):
                 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
                 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
             },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         {
             'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
@@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor):
                 'upload_date': '20131002',
                 'title': 'The Mummy’s Hand (1940)',
             },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         {
             # Youtube embedded video
             'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
-            'md5': 'df4cf8a1dcedaec79a73d96d83b99023',
+            'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',
             'info_dict': {
                 'id': 'OEVzPCY2T-g',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
                 'upload_date': '20061207',
                 'uploader': 'Cinemassacre',
@@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor):
         {
             # Youtube embedded video
             'url': 'http://cinemassacre.com/2006/09/01/mckids/',
-            'md5': '6eb30961fa795fedc750eac4881ad2e1',
+            'md5': '7393c4e0f54602ad110c793eb7a6513a',
             'info_dict': {
                 'id': 'FnxsNhuikpo',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'upload_date': '20060901',
-                'uploader': 'Cinemassacre Extras',
+                'uploader': 'Cinemassacre Extra',
                 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
                 'uploader_id': 'Cinemassacre',
                 'title': 'AVGN: McKids',
@@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor):
                 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
                 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
                 'upload_date': '20150525',
-            }
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         }
     ]
 
index 5c3908f72b2f94c5557aaeaaa2e19ec78716c0f0..3cf0bf95b4386b393df28d46367887c2e7952ae9 100644 (file)
@@ -51,9 +51,7 @@ class CNETIE(ThePlatformIE):
             uploader = None
             uploader_id = None
 
-        mpx_account = data['config']['uvpConfig']['default']['mpx_account']
-
-        metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id)
+        metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id)
         description = vdata.get('description') or metadata.get('description')
         duration = int_or_none(vdata.get('duration')) or metadata.get('duration')
 
@@ -62,7 +60,7 @@ class CNETIE(ThePlatformIE):
         for (fkey, vid) in vdata['files'].items():
             if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
                 continue
-            release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid)
+            release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid
             if fkey == 'hds':
                 release_url += '&manifest=f4m'
             tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)
index f411ea7633568915e4e97df61958d13c6a8aca80..ecd7da767f72da9c07903d7013cd95ebf616078a 100644 (file)
@@ -15,13 +15,14 @@ import math
 from ..compat import (
     compat_cookiejar,
     compat_cookies,
+    compat_etree_fromstring,
     compat_getpass,
     compat_http_client,
+    compat_os_name,
+    compat_str,
     compat_urllib_error,
     compat_urllib_parse,
     compat_urlparse,
-    compat_str,
-    compat_etree_fromstring,
 )
 from ..utils import (
     NO_DEFAULT,
@@ -46,6 +47,8 @@ from ..utils import (
     xpath_with_ns,
     determine_protocol,
     parse_duration,
+    mimetype2ext,
+    update_url_query,
 )
 
 
@@ -103,7 +106,7 @@ class InfoExtractor(object):
                     * protocol   The protocol that will be used for the actual
                                  download, lower-case.
                                  "http", "https", "rtsp", "rtmp", "rtmpe",
-                                 "m3u8", or "m3u8_native".
+                                 "m3u8", "m3u8_native" or "http_dash_segments".
                     * preference Order number of this format. If this field is
                                  present and not None, the formats get sorted
                                  by this field, regardless of all other values.
@@ -156,12 +159,14 @@ class InfoExtractor(object):
     thumbnail:      Full URL to a video thumbnail image.
     description:    Full video description.
     uploader:       Full name of the video uploader.
+    license:        License name the video is licensed under.
     creator:        The main artist who created the video.
     release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
     uploader_id:    Nickname or id of the video uploader.
+    uploader_url:   Full URL to a personal webpage of the video uploader.
     location:       Physical location where the video was filmed.
     subtitles:      The available subtitles as a dictionary in the format
                     {language: subformats}. "subformats" is a list sorted from
@@ -341,7 +346,7 @@ class InfoExtractor(object):
     def IE_NAME(self):
         return compat_str(type(self).__name__[:-2])
 
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
         """ Returns the response handle """
         if note is None:
             self.report_download_webpage(video_id)
@@ -350,6 +355,12 @@ class InfoExtractor(object):
                 self.to_screen('%s' % (note,))
             else:
                 self.to_screen('%s: %s' % (video_id, note))
+        # data, headers and query params will be ignored for `Request` objects
+        if isinstance(url_or_request, compat_str):
+            if query:
+                url_or_request = update_url_query(url_or_request, query)
+            if data or headers:
+                url_or_request = sanitized_Request(url_or_request, data, headers or {})
         try:
             return self._downloader.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -365,13 +376,13 @@ class InfoExtractor(object):
                 self._downloader.report_warning(errmsg)
                 return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
 
-        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
         if urlh is False:
             assert not fatal
             return False
@@ -424,7 +435,7 @@ class InfoExtractor(object):
             self.to_screen('Saving request to ' + filename)
             # Working around MAX_PATH limitation on Windows (see
             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-            if os.name == 'nt':
+            if compat_os_name == 'nt':
                 absfilepath = os.path.abspath(filename)
                 if len(absfilepath) > 259:
                     filename = '\\\\?\\' + absfilepath
@@ -458,13 +469,13 @@ class InfoExtractor(object):
 
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -479,10 +490,10 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None):
+                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
         if xml_string is False:
             return xml_string
         if transform_source:
@@ -493,10 +504,10 @@ class InfoExtractor(object):
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True, encoding=None):
+                       fatal=True, encoding=None, data=None, headers=None, query=None):
         json_string = self._download_webpage(
             url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding)
+            encoding=encoding, data=data, headers=headers, query=query)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -593,7 +604,7 @@ class InfoExtractor(object):
                 if mobj:
                     break
 
-        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
+        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
             _name = '\033[0;34m%s\033[0m' % name
         else:
             _name = name
@@ -899,6 +910,16 @@ class InfoExtractor(object):
                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                 formats)
 
+    @staticmethod
+    def _remove_duplicate_formats(formats):
+        format_urls = set()
+        unique_formats = []
+        for f in formats:
+            if f['url'] not in format_urls:
+                format_urls.add(f['url'])
+                unique_formats.append(f)
+        formats[:] = unique_formats
+
     def _is_valid_url(self, url, video_id, item='video'):
         url = self._proto_relative_url(url, scheme='http:')
         # For now assume non HTTP(S) URLs always valid
@@ -952,6 +973,13 @@ class InfoExtractor(object):
         if manifest is False:
             return []
 
+        return self._parse_f4m_formats(
+            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+            transform_source=transform_source, fatal=fatal)
+
+    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                           fatal=True):
         formats = []
         manifest_version = '1.0'
         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -977,7 +1005,8 @@ class InfoExtractor(object):
                 # bitrate in f4m downloader
                 if determine_ext(manifest_url) == 'f4m':
                     formats.extend(self._extract_f4m_formats(
-                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
+                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+                        transform_source=transform_source, fatal=fatal))
                     continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
@@ -1022,11 +1051,21 @@ class InfoExtractor(object):
             return []
         m3u8_doc, urlh = res
         m3u8_url = urlh.geturl()
-        # A Media Playlist Tag MUST NOT appear in a Master Playlist
-        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
-        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
-        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
-        if '#EXT-X-TARGETDURATION' in m3u8_doc:
+
+        # We should try extracting formats only from master playlists [1], i.e.
+        # playlists that describe the available qualities. Media playlists [2],
+        # on the other hand, should be returned as is, since they contain just
+        # the media without quality renditions.
+        # Fortunately, a master playlist can easily be distinguished from a
+        # media playlist based on tag availability. According to [1, 2], master
+        # playlist tags MUST NOT appear in a media playlist and vice versa.
+        # According to [3], the #EXT-X-TARGETDURATION tag is REQUIRED for every
+        # media playlist and MUST NOT appear in a master playlist, so we can
+        # reliably detect a media playlist with this criterion.
+        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
+        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
+        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
             return [{
                 'url': m3u8_url,
                 'format_id': m3u8_id,
@@ -1073,19 +1112,29 @@ class InfoExtractor(object):
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
-                codecs = last_info.get('CODECS')
-                if codecs:
-                    # TODO: looks like video codec is not always necessarily goes first
-                    va_codecs = codecs.split(',')
-                    if va_codecs[0]:
-                        f['vcodec'] = va_codecs[0]
-                    if len(va_codecs) > 1 and va_codecs[1]:
-                        f['acodec'] = va_codecs[1]
                 resolution = last_info.get('RESOLUTION')
                 if resolution:
                     width_str, height_str = resolution.split('x')
                     f['width'] = int(width_str)
                     f['height'] = int(height_str)
+                codecs = last_info.get('CODECS')
+                if codecs:
+                    vcodec, acodec = [None] * 2
+                    va_codecs = codecs.split(',')
+                    if len(va_codecs) == 1:
+                        # Audio only entries usually come with single codec and
+                        # no resolution. For more robustness we also check it to
+                        # be mp4 audio.
+                        if not resolution and va_codecs[0].startswith('mp4a'):
+                            vcodec, acodec = 'none', va_codecs[0]
+                        else:
+                            vcodec = va_codecs[0]
+                    else:
+                        vcodec, acodec = va_codecs[:2]
+                    f.update({
+                        'acodec': acodec,
+                        'vcodec': vcodec,
+                    })
                 if last_media is not None:
                     f['m3u8_media'] = last_media
                     last_media = None
@@ -1106,8 +1155,8 @@ class InfoExtractor(object):
                 out.append('{%s}%s' % (namespace, c))
         return '/'.join(out)
 
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
 
         if smil is False:
             assert not fatal
@@ -1124,10 +1173,10 @@ class InfoExtractor(object):
             return {}
         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
 
-    def _download_smil(self, smil_url, video_id, fatal=True):
+    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
         return self._download_xml(
             smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
 
     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
         namespace = self._parse_smil_namespace(smil)
@@ -1277,16 +1326,7 @@ class InfoExtractor(object):
             if not src or src in urls:
                 continue
             urls.append(src)
-            ext = textstream.get('ext') or determine_ext(src)
-            if not ext:
-                type_ = textstream.get('type')
-                SUBTITLES_TYPES = {
-                    'text/vtt': 'vtt',
-                    'text/srt': 'srt',
-                    'application/smptett+xml': 'tt',
-                }
-                if type_ in SUBTITLES_TYPES:
-                    ext = SUBTITLES_TYPES[type_]
+            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
             subtitles.setdefault(lang, []).append({
                 'url': src,
@@ -1422,8 +1462,9 @@ class InfoExtractor(object):
                         continue
                     representation_attrib = adaptation_set.attrib.copy()
                     representation_attrib.update(representation.attrib)
-                    mime_type = representation_attrib.get('mimeType')
-                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+                    # According to page 41 of ISO/IEC 23009-1:2014, @mimeType is mandatory
+                    mime_type = representation_attrib['mimeType']
+                    content_type = mime_type.split('/')[0]
                     if content_type == 'text':
                         # TODO implement WebVTT downloading
                         pass
@@ -1446,6 +1487,7 @@ class InfoExtractor(object):
                         f = {
                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                             'url': base_url,
+                            'ext': mimetype2ext(mime_type),
                             'width': int_or_none(representation_attrib.get('width')),
                             'height': int_or_none(representation_attrib.get('height')),
                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
@@ -1598,6 +1640,15 @@ class InfoExtractor(object):
     def _get_automatic_captions(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def mark_watched(self, *args, **kwargs):
+        if (self._downloader.params.get('mark_watched', False) and
+                (self._get_login_info()[0] is not None or
+                    self._downloader.params.get('cookiefile') is not None)):
+            self._mark_watched(*args, **kwargs)
+
+    def _mark_watched(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
index 373b3b4b4735d8544128c48a10037eed3c570e5d..bdc768c783b9b3213badc5cf4b354f6159142f9f 100644 (file)
@@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):
             'display_id': 'iseven',
             'ext': 'flv',
             'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+            'description': 'md5:f34981259a03e980a3c6404190a3ed61',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': '7师傅',
             'uploader_id': '431925',
@@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
     }, {
         'url': 'http://www.douyutv.com/85982',
         'info_dict': {
@@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'Room not found',
+    }, {
+        'url': 'http://www.douyutv.com/17732',
+        'info_dict': {
+            'id': '17732',
+            'display_id': '17732',
+            'ext': 'flv',
+            'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:f34981259a03e980a3c6404190a3ed61',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': '7师傅',
+            'uploader_id': '431925',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
index 6cda56a7fdd045ef032bb496fadd8462c3839ecd..a638c827c7e01ed8acee28a091d3cdcff510ada0 100644 (file)
@@ -1,6 +1,8 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals
 
+import json
+import re
 import time
 
 from .common import InfoExtractor
@@ -8,44 +10,125 @@ from ..utils import int_or_none
 
 
 class DPlayIE(InfoExtractor):
-    _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+    _VALID_URL = r'http://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
 
-    _TEST = {
+    _TESTS = [{
+        'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
+        'info_dict': {
+            'id': '1255600',
+            'display_id': 'stagione-1-episodio-25',
+            'ext': 'mp4',
+            'title': 'Episodio 25',
+            'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
+            'duration': 2761,
+            'timestamp': 1454701800,
+            'upload_date': '20160205',
+            'creator': 'RTIT',
+            'series': 'Take me out',
+            'season_number': 1,
+            'episode_number': 25,
+            'age_limit': 0,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
         'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
         'info_dict': {
             'id': '3172',
-            'ext': 'mp4',
             'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+            'ext': 'flv',
             'title': 'Svensken lär sig njuta av livet',
+            'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
             'duration': 2650,
+            'timestamp': 1365454320,
+            'upload_date': '20130408',
+            'creator': 'Kanal 5 (Home)',
+            'series': 'Nugammalt - 77 händelser som format Sverige',
+            'season_number': 1,
+            'episode_number': 1,
+            'age_limit': 0,
         },
-    }
+    }, {
+        'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/',
+        'info_dict': {
+            'id': '70816',
+            'display_id': 'season-6-episode-12',
+            'ext': 'flv',
+            'title': 'Episode 12',
+            'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90',
+            'duration': 2563,
+            'timestamp': 1429696800,
+            'upload_date': '20150422',
+            'creator': 'Kanal 4',
+            'series': 'Mig og min mor',
+            'season_number': 6,
+            'episode_number': 12,
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        domain = mobj.group('domain')
+
         webpage = self._download_webpage(url, display_id)
+
         video_id = self._search_regex(
-            r'data-video-id="(\d+)"', webpage, 'video id')
+            r'data-video-id=["\'](\d+)', webpage, 'video id')
 
         info = self._download_json(
-            'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+            'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),
             video_id)['data'][0]
 
-        self._set_cookie(
-            'secure.dplay.se', 'dsc-geo',
-            '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
-        # TODO: consider adding support for 'stream_type=hds', it seems to
-        # require setting some cookies
-        manifest_url = self._download_json(
-            'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
-            video_id, 'Getting manifest url for hls stream')['hls']
-        formats = self._extract_m3u8_formats(
-            manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+        title = info['title']
+
+        PROTOCOLS = ('hls', 'hds')
+        formats = []
+
+        def extract_formats(protocol, manifest_url):
+            if protocol == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, video_id, ext='mp4',
+                    entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False))
+            elif protocol == 'hds':
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0',
+                    video_id, f4m_id=protocol, fatal=False))
+
+        domain_tld = domain.split('.')[-1]
+        if domain_tld in ('se', 'dk'):
+            for protocol in PROTOCOLS:
+                self._set_cookie(
+                    'secure.dplay.%s' % domain_tld, 'dsc-geo',
+                    json.dumps({
+                        'countryCode': domain_tld.upper(),
+                        'expiry': (time.time() + 20 * 60) * 1000,
+                    }))
+                stream = self._download_json(
+                    'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s'
+                    % (domain_tld, video_id, protocol), video_id,
+                    'Downloading %s stream JSON' % protocol, fatal=False)
+                if stream and stream.get(protocol):
+                    extract_formats(protocol, stream[protocol])
+        else:
+            for protocol in PROTOCOLS:
+                if info.get(protocol):
+                    extract_formats(protocol, info[protocol])
 
         return {
             'id': video_id,
             'display_id': display_id,
-            'title': info['title'],
-            'formats': formats,
+            'title': title,
+            'description': info.get('video_metadata_longDescription'),
             'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+            'timestamp': int_or_none(info.get('video_publish_date')),
+            'creator': info.get('video_metadata_homeChannel'),
+            'series': info.get('video_metadata_show'),
+            'season_number': int_or_none(info.get('season')),
+            'episode_number': int_or_none(info.get('episode')),
+            'age_limit': int_or_none(info.get('minimum_age')),
+            'formats': formats,
         }
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644 (file)
index 0000000..b6c9855
--- /dev/null
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+    IE_NAME = 'dw'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+    _TESTS = [{
+        # video
+        'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+        'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+        'info_dict': {
+            'id': '19112290',
+            'ext': 'mp4',
+            'title': 'Intelligent light',
+            'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+            'upload_date': '20160311',
+        }
+    }, {
+        # audio
+        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+        'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+        'info_dict': {
+            'id': '19111941',
+            'ext': 'mp3',
+            'title': 'WorldLink: My business',
+            'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+            'upload_date': '20160311',
+        }
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        webpage = self._download_webpage(url, media_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        title = hidden_inputs['media_title']
+
+        formats = []
+        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+            formats = self._extract_smil_formats(
+                'http://www.dw.com/smil/v-%s' % media_id, media_id,
+                transform_source=lambda s: s.replace(
+                    'rtmp://tv-od.dw.de/flash/',
+                    'http://tv-download.dw.de/dwtv_video/flv/'))
+        else:
+            formats = [{'url': hidden_inputs['file_name']}]
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': hidden_inputs.get('preview_image'),
+            'duration': int_or_none(hidden_inputs.get('file_duration')),
+            'upload_date': hidden_inputs.get('display_date'),
+            'formats': formats,
+        }
+
+
+class DWArticleIE(InfoExtractor):
+    IE_NAME = 'dw:article'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+        'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+        'info_dict': {
+            'id': '19105868',
+            'ext': 'mp4',
+            'title': 'The harsh life of refugees in Idomeni',
+            'description': 'md5:196015cc7e48ebf474db9399420043c7',
+            'upload_date': '20160310',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        media_id = hidden_inputs['media_id']
+        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+        media_url = compat_urlparse.urljoin(url, media_path)
+        return self.url_result(media_url, 'DW', media_id)
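
The new DW extractor downloads a SMIL manifest that lists RTMP paths, and the transform_source hook above rewrites them to what appears to be an HTTP mirror of the same files before the XML is parsed. A minimal sketch; the clip path is invented:

    def rtmp_to_http(smil_xml):
        # Applied to the raw SMIL text before XML parsing, so no RTMP
        # downloader is needed for these formats
        return smil_xml.replace(
            'rtmp://tv-od.dw.de/flash/',
            'http://tv-download.dw.de/dwtv_video/flv/')

    print(rtmp_to_http('<video src="rtmp://tv-od.dw.de/flash/dwtv/clip.flv"/>'))
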
index 00a69e6312aede6069e062c6abff29137939daa9..8c725a4e631860584781b116e72b02dd05813fc2 100644 (file)
@@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):
     _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
     IE_DESC = 'El País'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
         'md5': '98406f301f19562170ec071b83433d55',
         'info_dict': {
@@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):
             'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
             'upload_date': '20140206',
         }
-    }
+    }, {
+        'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t',
+        'md5': '3bd5b09509f3519d7d9e763179b013de',
+        'info_dict': {
+            'id': '1456340311_668921',
+            'ext': 'mp4',
+            'title': 'Cómo hacer el mejor café con cafetera italiana',
+            'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.',
+            'upload_date': '20160303',
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
         prefix = self._html_search_regex(
-            r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
+            r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')
         video_suffix = self._search_regex(
-            r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
+            r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')
         video_url = prefix + video_suffix
         thumbnail_suffix = self._search_regex(
-            r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
-            fatal=False)
+            r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'",
+            webpage, 'thumbnail URL', fatal=False)
         thumbnail = (
             None if thumbnail_suffix is None
             else prefix + thumbnail_suffix)
         title = self._html_search_regex(
-            '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+            (r"tituloVideo\s*=\s*'([^']+)'", webpage, 'title',
+             r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),
             webpage, 'title')
-        date_str = self._search_regex(
+        upload_date = unified_strdate(self._search_regex(
             r'<p class="date-header date-int updated"\s+title="([^"]+)">',
-            webpage, 'upload date', fatal=False)
-        upload_date = (None if date_str is None else unified_strdate(date_str))
+            webpage, 'upload date', default=None) or self._html_search_meta(
+            'datePublished', webpage, 'timestamp'))
 
         return {
             'id': video_id,
index e4180701d7d5fe7f538d029e8ffb27235b6135df..e5e57d48518d3dd3999dad650d0c32406079ce33 100644 (file)
@@ -1,21 +1,13 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    url_basename,
-)
 
 
 class EngadgetIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://www.engadget.com/
-        (?:video(?:/5min)?/(?P<id>\d+)|
-            [\d/]+/.*?)
-        '''
+    _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://www.engadget.com/video/5min/518153925/',
+        'url': 'http://www.engadget.com/video/518153925/',
         'md5': 'c6820d4828a5064447a4d9fc73f312c9',
         'info_dict': {
             'id': '518153925',
@@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-
-        if video_id is not None:
-            return self.url_result('5min:%s' % video_id)
-        else:
-            title = url_basename(url)
-            webpage = self._download_webpage(url, title)
-            ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
-            return {
-                '_type': 'playlist',
-                'title': title,
-                'entries': [self.url_result('5min:%s' % vid) for vid in ids]
-            }
+        return self.url_result('5min:%s' % video_id)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 0a9a5ca718ce171ef211fee6f04b4685452add1e..f5bbd39d2d0e90996c118e3fae325034fc2bbb6d 100644
@@ -34,9 +34,12 @@ class FacebookIE(InfoExtractor):
                                 video/video\.php|
                                 photo\.php|
                                 video\.php|
-                                video/embed
-                            )\?(?:.*?)(?:v|video_id)=|
-                            [^/]+/videos/(?:[^/]+/)?
+                                video/embed|
+                                story\.php
+                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
+                            [^/]+/videos/(?:[^/]+/)?|
+                            [^/]+/posts/|
+                            groups/[^/]+/permalink/
                         )|
                     facebook:
                 )
@@ -49,6 +52,8 @@ class FacebookIE(InfoExtractor):
 
     _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
 
+    _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
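+    # Used for facebook:<id> shorthand URLs and as a fallback for post pages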
+
     _TESTS = [{
         'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
         'md5': '6a40d33c0eccbb1af76cf0485a052659',
@@ -80,6 +85,33 @@ class FacebookIE(InfoExtractor):
             'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...',
             'uploader': 'Demy de Zeeuw',
         },
+    }, {
+        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
+        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
+        'info_dict': {
+            'id': '544765982287235',
+            'ext': 'mp4',
+            'title': '"What are you doing running in the snow?"',
+            'uploader': 'FailArmy',
+        }
+    }, {
+        'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
+        'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
+        'info_dict': {
+            'id': '1035862816472149',
+            'ext': 'mp4',
+            'title': 'What the Flock Is Going On In New Zealand  Credit: ViralHog',
+            'uploader': 'S. Saint',
+        },
+    }, {
+        'note': 'swf params escaped',
+        'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
+        'md5': '97ba073838964d12c70566e0085c2b91',
+        'info_dict': {
+            'id': '10153664894881749',
+            'ext': 'mp4',
+            'title': 'Facebook video #10153664894881749',
+        },
     }, {
         'url': 'https://www.facebook.com/video.php?v=10204634152394104',
         'only_matching': True,
@@ -92,6 +124,9 @@ class FacebookIE(InfoExtractor):
     }, {
         'url': 'facebook:544765982287235',
         'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+        'only_matching': True,
     }]
 
     def _login(self):
@@ -160,19 +195,19 @@ class FacebookIE(InfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
+    def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
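+        # Returns (webpage, info_dict); info_dict is False when the page
+        # contains no directly extractable video and fatal_if_no_video is unset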
+        req = sanitized_Request(url)
         req.add_header('User-Agent', self._CHROME_USER_AGENT)
         webpage = self._download_webpage(req, video_id)
 
         video_data = None
 
-        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
+        BEFORE = '{swf.addParam(param[0], param[1]);});'
         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
-        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
+        m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
         if m:
-            data = dict(json.loads(m.group(1)))
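+            # The swf params JSON may sit inside a JS string literal, so
+            # unescape doubled backslashes and escaped quotes before parsing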
+            swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+            data = dict(json.loads(swf_params))
             params_raw = compat_urllib_parse_unquote(data['params'])
             video_data = json.loads(params_raw)['video_data']
 
@@ -185,13 +220,15 @@ class FacebookIE(InfoExtractor):
 
         if not video_data:
             server_js_data = self._parse_json(self._search_regex(
-                r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id)
+                r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id)
             for item in server_js_data.get('instances', []):
                 if item[1][0] == 'VideoConfig':
                     video_data = video_data_list2dict(item[2][0]['videoData'])
                     break
 
         if not video_data:
+            if not fatal_if_no_video:
+                return webpage, False
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
             if m_msg is not None:
                 raise ExtractorError(
@@ -208,10 +245,13 @@ class FacebookIE(InfoExtractor):
                 for src_type in ('src', 'src_no_ratelimit'):
                     src = f[0].get('%s_%s' % (quality, src_type))
                     if src:
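+                        # Deprioritize progressive downloads, but give HD
+                        # variants a small boost within each group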
+                        preference = -10 if format_id == 'progressive' else 0
+                        if quality == 'hd':
+                            preference += 5
                         formats.append({
                             'format_id': '%s_%s_%s' % (format_id, quality, src_type),
                             'url': src,
-                            'preference': -10 if format_id == 'progressive' else 0,
+                            'preference': preference,
                         })
             dash_manifest = f[0].get('dash_manifest')
             if dash_manifest:
@@ -234,39 +274,36 @@ class FacebookIE(InfoExtractor):
             video_title = 'Facebook video #%s' % video_id
         uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
 
-        return {
+        info_dict = {
             'id': video_id,
             'title': video_title,
             'formats': formats,
             'uploader': uploader,
         }
 
-
-class FacebookPostIE(InfoExtractor):
-    IE_NAME = 'facebook:post'
-    _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)'
-    _TEST = {
-        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
-        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
-        'info_dict': {
-            'id': '544765982287235',
-            'ext': 'mp4',
-            'title': '"What are you doing running in the snow?"',
-            'uploader': 'FailArmy',
-        }
-    }
+        return webpage, info_dict
 
     def _real_extract(self, url):
-        post_id = self._match_id(url)
+        video_id = self._match_id(url)
+
+        real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
+        webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
 
-        webpage = self._download_webpage(url, post_id)
+        if info_dict:
+            return info_dict
 
-        entries = [
-            self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
-            for video_id in self._parse_json(
-                self._search_regex(
-                    r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
-                    webpage, 'video ids', group='ids'),
-                post_id)]
+        if '/posts/' in url:
+            entries = [
+                self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+                for vid in self._parse_json(
+                    self._search_regex(
+                        r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
+                        webpage, 'video ids', group='ids'),
+                    video_id)]
 
-        return self.playlist_result(entries, post_id)
+            return self.playlist_result(entries, video_id)
+        else:
+            _, info_dict = self._extract_from_url(
+                self._VIDEO_PAGE_TEMPLATE % video_id,
+                video_id, fatal_if_no_video=True)
+            return info_dict
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 6f9b003c2b2f490094dde43098e8087062886a2f..fd535457dc56a589eaf9e062dc40fe5374735020 100644
@@ -52,7 +52,7 @@ class FazIE(InfoExtractor):
         formats = []
         for pref, code in enumerate(['LOW', 'HIGH', 'HQ']):
             encoding = xpath_element(encodings, code)
-            if encoding:
+            if encoding is not None:
                 encoding_url = xpath_text(encoding, 'FILENAME')
                 if encoding_url:
                     formats.append({
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 2955965d908c15f21b3f8880993530779f97ec15..67d50a386ce812018047f711205b8619d75c1bf8 100644
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
@@ -16,12 +18,7 @@ from ..utils import (
 
 class FiveMinIE(InfoExtractor):
     IE_NAME = '5min'
-    _VALID_URL = r'''(?x)
-        (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
-            https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
-            5min:)
-        (?P<id>\d+)
-        '''
+    _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'
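+    # Accepts 5min:<video_id>[:<sid>] or a raw PlayerSeed.js embed URL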
 
     _TESTS = [
         {
@@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):
                 'title': 'How to Make a Next-Level Fruit Salad',
                 'duration': 184,
             },
+            'skip': 'no longer available',
         },
     ]
     _ERRORS = {
@@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        sid = mobj.group('sid')
+
+        if mobj.group('query'):
+            qs = compat_parse_qs(mobj.group('query'))
+            if not qs.get('playList'):
+                raise ExtractorError('Invalid URL', expected=True)
+            video_id = qs['playList'][0]
+            if qs.get('sid'):
+                sid = qs['sid'][0]
+
         embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
-        embed_page = self._download_webpage(embed_url, video_id,
-                                            'Downloading embed page')
-        sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
-        query = compat_urllib_parse.urlencode({
-            'func': 'GetResults',
-            'playlist': video_id,
-            'sid': sid,
-            'isPlayerSeed': 'true',
-            'url': embed_url,
-        })
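+        # Fetch the embed page only when no sid was supplied via the
+        # 5min:id:sid shorthand or the PlayerSeed query string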
+        if not sid:
+            embed_page = self._download_webpage(embed_url, video_id,
+                                                'Downloading embed page')
+            sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+
         response = self._download_json(
-            'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+            'https://syn.5min.com/handlers/SenseHandler.ashx?' +
+            compat_urllib_parse.urlencode({
+                'func': 'GetResults',
+                'playlist': video_id,
+                'sid': sid,
+                'isPlayerSeed': 'true',
+                'url': embed_url,
+            }),
             video_id)
         if not response['success']:
             raise ExtractorError(
@@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):
         parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
             compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
         for rendition in info['Renditions']:
-            if rendition['RenditionType'] == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
-            elif rendition['RenditionType'] == 'aac':
+            if rendition['RenditionType'] in ('aac', 'm3u8'):
                 continue
             else:
                 rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 318ac013d44b9ca8ce9de5c77d67b2cd3c9bb1e1..1dc50318ce81feb2604cfef943ddfb90e0a7641b 100644
@@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE):
                 # 'upload_date': '20141204',
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         {
             'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py
index c210177f7297e174d38988a2e62f379a9a478305..1477708bbec14c38bf0db7801d09d68a22ff1546 100644
@@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor):
         'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0',
         'info_dict': {
             'id': 'poKsVCZ64uU',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'Obama, Romney Campaign in Colorado Ahead of Debate',
             'description': 'Obama, Romney Campaign in Colorado Ahead of Debate',
             'uploader': 'freespeechtv',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 45adbb7a3947c9f3f6a3522864ac5fe370cdc254..8121f04a5e02cf672dc6ab0f152d21df8b98034e 100644
@@ -47,6 +47,7 @@ from .senateisvp import SenateISVPIE
 from .svt import SVTIE
 from .pornhub import PornHubIE
 from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
 from .vimeo import VimeoIE
 from .dailymotion import DailymotionCloudIE
 from .onionstudios import OnionStudiosIE
@@ -1241,28 +1242,34 @@ class GenericIE(InfoExtractor):
             full_response = self._request_webpage(request, video_id)
             head_response = full_response
 
+        info_dict = {
+            'id': video_id,
+            'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+        }
+
         # Check for direct link to a video
         content_type = head_response.headers.get('Content-Type', '')
         m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
         if m:
             upload_date = unified_strdate(
                 head_response.headers.get('Last-Modified'))
-            formats = []
-            if m.group('format_id').endswith('mpegurl'):
+            format_id = m.group('format_id')
+            if format_id.endswith('mpegurl'):
                 formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            elif format_id == 'f4m':
+                formats = self._extract_f4m_formats(url, video_id)
             else:
                 formats = [{
                     'format_id': m.group('format_id'),
                     'url': url,
                     'vcodec': 'none' if m.group('type') == 'audio' else None
                 }]
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            info_dict.update({
                 'direct': True,
                 'formats': formats,
                 'upload_date': upload_date,
-            }
+            })
+            return info_dict
 
         if not self._downloader.params.get('test', False) and not is_intentional:
             force = self._downloader.params.get('force_generic_extractor', False)
@@ -1290,13 +1297,12 @@ class GenericIE(InfoExtractor):
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
                 head_response.headers.get('Last-Modified'))
-            return {
-                'id': video_id,
-                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            info_dict.update({
                 'direct': True,
                 'url': url,
                 'upload_date': upload_date,
-            }
+            })
+            return info_dict
 
         webpage = self._webpage_read_content(
             full_response, url, video_id, prefix=first_bytes)
@@ -1313,12 +1319,12 @@ class GenericIE(InfoExtractor):
             elif doc.tag == '{http://xspf.org/ns/0/}playlist':
                 return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
-                return {
-                    'id': video_id,
-                    'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
-                    'formats': self._parse_mpd_formats(
-                        doc, video_id, mpd_base_url=url.rpartition('/')[0]),
-                }
+                info_dict['formats'] = self._parse_mpd_formats(
+                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
+                return info_dict
+            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                return info_dict
         except compat_xml_parse_error:
             pass
 
@@ -1573,6 +1579,11 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'VK')
 
+        # Look for embedded Odnoklassniki player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Odnoklassniki')
+
         # Look for embedded ivi player
         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
         if mobj is not None:
@@ -1628,6 +1639,11 @@ class GenericIE(InfoExtractor):
         if xhamster_urls:
             return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
 
+        # Look for embedded TNAFlixNetwork player
+        tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+        if tnaflix_urls:
+            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+
         # Look for embedded Tvigle player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1974,6 +1990,8 @@ class GenericIE(InfoExtractor):
                 entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
             elif ext == 'mpd':
                 entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+            elif ext == 'f4m':
+                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
             else:
                 entry_info_dict['url'] = video_url
 
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index f354c9c7aa3f43a1aed6c04afb60fe5a9d8c434b..766fc26d0f01145bdd2456a221940fa60ece6953 100644
@@ -10,8 +10,8 @@ from ..utils import (
 
 
 class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
-    _TEST = {
+    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _TESTS = [{
         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
         'md5': '881f7700aec4f538571fa1e0eed4a7b6',
         'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
             'title': 'Big Buck Bunny.mp4',
             'duration': 46,
         }
-    }
+    }, {
+        # video id is longer than 28 characters
+        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+        'only_matching': True,
+    }]
     _FORMATS_EXT = {
         '5': 'flv',
         '6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
             webpage)
         if mobj:
             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
@@ -82,7 +86,7 @@ class GoogleDriveIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'duration': duration,
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 02e1e428e9e41ba75a2ff5c37c7cf0732682c111..b61b2dc4e0e36867c55dcd62068466f1080a22ad 100644
@@ -42,7 +42,7 @@ class ImdbIE(InfoExtractor):
             for f_url, f_name in extra_formats]
         format_pages.append(player_page)
 
-        quality = qualities(['SD', '480p', '720p'])
+        quality = qualities(('SD', '480p', '720p', '1080p'))
         formats = []
         for format_page in format_pages:
             json_data = self._search_regex(
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 12fb5e8e1dcb1e5dfba8057a1496edcbe0f61f82..9622f198aa6aaf99094a9b85c5a914d4f0c07d46 100644
@@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor):
             'url': self._proto_relative_url(thumbnail)
         } for thumbnail in video.get('thumbnails', [])]
 
-        tags = [tag['title'] for tag in video.get('tags', [])]
+        tags = [tag['title'] for tag in video.get('tags') or []]
 
         return {
             'id': video.get('id') or video_id,
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 016af2084b3301bccf6f6191df66b2eec84968a4..cca0b8a9323c0d2412c65610a3acb3ef2943ba6f 100644
@@ -4,15 +4,12 @@ from __future__ import unicode_literals
 
 import base64
 
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_parse_qs,
-)
+from ..compat import compat_urllib_parse_unquote
 from ..utils import determine_ext
+from .bokecc import BokeCCBaseIE
 
 
-class InfoQIE(InfoExtractor):
+class InfoQIE(BokeCCBaseIE):
     _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
 
     _TESTS = [{
@@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor):
         },
     }]
 
-    def _extract_bokecc_videos(self, webpage, video_id):
-        # TODO: bokecc.com is a Chinese video cloud platform
-        # It should have an independent extractor but I don't have other
-        # examples using bokecc
-        player_params_str = self._html_search_regex(
-            r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
-            webpage, 'player params', default=None)
-
-        player_params = compat_parse_qs(player_params_str)
-
-        info_xml = self._download_xml(
-            'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
-                player_params['siteid'][0], player_params['vid'][0]), video_id)
-
-        return [{
-            'format_id': 'bokecc',
-            'url': quality.find('./copy').attrib['playurl'],
-            'preference': int(quality.attrib['value']),
-        } for quality in info_xml.findall('./video/quality')]
-
     def _extract_rtmp_videos(self, webpage):
         # The server URL is hardcoded
         video_url = 'rtmpe://video.infoq.com/cfx/st/'
@@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor):
 
         if '/cn/' in url:
             # for China videos, HTTP video URL exists but always fails with 403
-            formats = self._extract_bokecc_videos(webpage, video_id)
+            formats = self._extract_bokecc_formats(webpage, video_id)
         else:
             formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)
 
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 9046705a54aac3eb37bed44792ac38b41239563e..e7c0cb3f66ab542e79f86238d2db991047d6d453 100644
 from __future__ import unicode_literals
 
 import hashlib
+import itertools
 import math
 import os
 import random
+import re
 import time
 import uuid
 
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
+    compat_str,
     compat_urllib_parse,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    decode_packed_codes,
     ExtractorError,
+    ohdave_rsa_encrypt,
+    remove_start,
     sanitized_Request,
     urlencode_postdata,
     url_basename,
 )
 
 
+def md5_text(text):
+    return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+
+class IqiyiSDK(object):
+    def __init__(self, target, ip, timestamp):
+        self.target = target
+        self.ip = ip
+        self.timestamp = timestamp
+
+    @staticmethod
+    def split_sum(data):
+        return compat_str(sum(map(lambda p: int(p, 16), list(data))))
+
+    @staticmethod
+    def digit_sum(num):
+        if isinstance(num, int):
+            num = compat_str(num)
+        return compat_str(sum(map(int, num)))
+
+    def even_odd(self):
+        even = self.digit_sum(compat_str(self.timestamp)[::2])
+        odd = self.digit_sum(compat_str(self.timestamp)[1::2])
+        return even, odd
+
+    def preprocess(self, chunksize):
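+        # md5 the current target, split the 32-char hex digest into chunks of
+        # the given size, and return them along with the IP address octets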
+        self.target = md5_text(self.target)
+        chunks = []
+        for i in range(32 // chunksize):
+            chunks.append(self.target[chunksize * i:chunksize * (i + 1)])
+        if 32 % chunksize:
+            chunks.append(self.target[32 - 32 % chunksize:])
+        return chunks, list(map(int, self.ip.split('.')))
+
+    def mod(self, modulus):
+        chunks, ip = self.preprocess(32)
+        self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip))
+
+    def split(self, chunksize):
+        modulus_map = {
+            4: 256,
+            5: 10,
+            8: 100,
+        }
+
+        chunks, ip = self.preprocess(chunksize)
+        ret = ''
+        for i in range(len(chunks)):
+            ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else ''
+            if chunksize == 8:
+                ret += ip_part + chunks[i]
+            else:
+                ret += chunks[i] + ip_part
+        self.target = ret
+
+    def handle_input16(self):
+        self.target = md5_text(self.target)
+        self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:])
+
+    def handle_input8(self):
+        self.target = md5_text(self.target)
+        ret = ''
+        for i in range(4):
+            part = self.target[8 * i:8 * (i + 1)]
+            ret += self.split_sum(part) + part
+        self.target = ret
+
+    def handle_sum(self):
+        self.target = md5_text(self.target)
+        self.target = self.split_sum(self.target) + self.target
+
+    def date(self, scheme):
+        self.target = md5_text(self.target)
+        d = time.localtime(self.timestamp)
+        strings = {
+            'y': compat_str(d.tm_year),
+            'm': '%02d' % d.tm_mon,
+            'd': '%02d' % d.tm_mday,
+        }
+        self.target += ''.join(map(lambda c: strings[c], list(scheme)))
+
+    def split_time_even_odd(self):
+        even, odd = self.even_odd()
+        self.target = odd + md5_text(self.target) + even
+
+    def split_time_odd_even(self):
+        even, odd = self.even_odd()
+        self.target = even + md5_text(self.target) + odd
+
+    def split_ip_time_sum(self):
+        chunks, ip = self.preprocess(32)
+        self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp)
+
+    def split_time_ip_sum(self):
+        chunks, ip = self.preprocess(32)
+        self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip))
+
+
+class IqiyiSDKInterpreter(object):
+    def __init__(self, sdk_code):
+        self.sdk_code = sdk_code
+
+    def run(self, target, ip, timestamp):
+        self.sdk_code = decode_packed_codes(self.sdk_code)
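+        # (decode_packed_codes reverses the eval(function(p,a,c,k,e,d)) packer)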
+
+        functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
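+        # each name is a transform applied as input = fn(input, ...) in the SDK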
+
+        sdk = IqiyiSDK(target, ip, timestamp)
+
+        other_functions = {
+            'handleSum': sdk.handle_sum,
+            'handleInput8': sdk.handle_input8,
+            'handleInput16': sdk.handle_input16,
+            'splitTimeEvenOdd': sdk.split_time_even_odd,
+            'splitTimeOddEven': sdk.split_time_odd_even,
+            'splitIpTimeSum': sdk.split_ip_time_sum,
+            'splitTimeIpSum': sdk.split_time_ip_sum,
+        }
+        for function in functions:
+            if re.match(r'mod\d+', function):
+                sdk.mod(int(function[3:]))
+            elif re.match(r'date[ymd]{3}', function):
+                sdk.date(function[4:])
+            elif re.match(r'split\d+', function):
+                sdk.split(int(function[5:]))
+            elif function in other_functions:
+                other_functions[function]()
+            else:
+                raise ExtractorError('Unknown function %s' % function)
+
+        return sdk.target
+
+
 class IqiyiIE(InfoExtractor):
     IE_NAME = 'iqiyi'
     IE_DESC = '爱奇艺'
 
     _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
 
+    _NETRC_MACHINE = 'iqiyi'
+
     _TESTS = [{
         'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
         'md5': '2cb594dc2781e6c941a110d8f358118b',
@@ -125,6 +266,13 @@ class IqiyiIE(InfoExtractor):
             },
         }],
         'expected_warnings': ['Needs a VIP account for full video'],
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+        'info_dict': {
+            'id': '202918101',
+            'title': '灌篮高手 国语版',
+        },
+        'playlist_count': 101,
     }]
 
     _FORMATS_MAP = [
@@ -136,9 +284,63 @@ class IqiyiIE(InfoExtractor):
         ('10', 'h1'),
     ]
 
+    def _real_initialize(self):
+        self._login()
+
     @staticmethod
-    def md5_text(text):
-        return hashlib.md5(text.encode('utf-8')).hexdigest()
+    def _rsa_fun(data):
+        # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
+        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
+        e = 65537
+
+        return ohdave_rsa_encrypt(data, e, N)
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+
+        # No authentication to be performed
+        if not username:
+            return True
+
+        data = self._download_json(
+            'http://kylin.iqiyi.com/get_token', None,
+            note='Getting token for login', errnote='Unable to get token for login')
+        sdk = data['sdk']
+        timestamp = int(time.time())
+        target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
+            username, self._rsa_fun(password.encode('utf-8')))
+
+        interp = IqiyiSDKInterpreter(sdk)
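+        # run() replays the SDK's transform chain over the login query string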
+        sign = interp.run(target, data['ip'], timestamp)
+
+        validation_params = {
+            'target': target,
+            'server': 'BEA3AA1908656AABCCFF76582C4C6660',
+            'token': data['token'],
+            'bird_src': 'f8d91d57af224da7893dd397d52d811a',
+            'sign': sign,
+            'bird_t': timestamp,
+        }
+        validation_result = self._download_json(
+            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+            note='Validating credentials', errnote='Unable to validate credentials')
+
+        MSG_MAP = {
+            'P00107': 'please login via the web interface and enter the CAPTCHA code',
+            'P00117': 'bad username or password',
+        }
+
+        code = validation_result['code']
+        if code != 'A00000':
+            msg = MSG_MAP.get(code)
+            if not msg:
+                msg = 'error %s' % code
+                if validation_result.get('msg'):
+                    msg += ': ' + validation_result['msg']
+            self._downloader.report_warning('unable to log in: ' + msg)
+            return False
+
+        return True
 
     def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
         auth_params = {
@@ -199,7 +401,7 @@ class IqiyiIE(InfoExtractor):
                 note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
             )['t']
             t = str(int(math.floor(int(tm) / (600.0))))
-            return self.md5_text(t + mg + x)
+            return md5_text(t + mg + x)
 
         video_urls_dict = {}
         need_vip_warning_report = True
@@ -278,16 +480,16 @@ class IqiyiIE(InfoExtractor):
         tail = tm + tvid
         param = {
             'key': 'fvip',
-            'src': self.md5_text('youtube-dl'),
+            'src': md5_text('youtube-dl'),
             'tvId': tvid,
             'vid': video_id,
             'vinfo': 1,
             'tm': tm,
-            'enc': self.md5_text(enc_key + tail),
+            'enc': md5_text(enc_key + tail),
             'qyid': _uuid,
             'tn': random.random(),
             'um': 0,
-            'authkey': self.md5_text(self.md5_text('') + tail),
+            'authkey': md5_text(md5_text('') + tail),
             'k_tag': 1,
         }
 
@@ -296,24 +498,62 @@ class IqiyiIE(InfoExtractor):
         raw_data = self._download_json(api_url, video_id)
         return raw_data
 
-    def get_enc_key(self, swf_url, video_id):
+    def get_enc_key(self, video_id):
         # TODO: automatic key extraction
         # last update at 2016-01-22 for Zombie::bite
-        enc_key = '6ab6d0280511493ba85594779759d4ed'
+        enc_key = '8ed797d224d043e7ac23d95b70227d32'
         return enc_key
 
+    def _extract_playlist(self, webpage):
+        PAGE_SIZE = 50
+
+        links = re.findall(
+            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+            webpage)
+        if not links:
+            return
+
+        album_id = self._search_regex(
+            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+        album_title = self._search_regex(
+            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+        entries = list(map(self.url_result, links))
+
+        # Start from page 2; the links on the first page have already been collected above
+        for page_num in itertools.count(2):
+            pagelist_page = self._download_webpage(
+                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                album_id,
+                note='Downloading playlist page %d' % page_num,
+                errnote='Failed to download playlist page %d' % page_num)
+            pagelist = self._parse_json(
+                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+            vlist = pagelist['data']['vlist']
+            for item in vlist:
+                entries.append(self.url_result(item['vurl']))
+            if len(vlist) < PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, album_id, album_title)
+
     def _real_extract(self, url):
         webpage = self._download_webpage(
             url, 'temp_id', note='download video page')
+
+        # There's no simple way to determine whether a URL points to a playlist
+        # or to a single video, so probe the page for playlist links first
+        playlist_result = self._extract_playlist(webpage)
+        if playlist_result:
+            return playlist_result
+
         tvid = self._search_regex(
             r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
         video_id = self._search_regex(
             r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        swf_url = self._search_regex(
-            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
         _uuid = uuid.uuid4().hex
 
-        enc_key = self.get_enc_key(swf_url, video_id)
+        enc_key = self.get_enc_key(video_id)
 
         raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
 
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index eef7daa299813219c5211aefe2051a1160238319..137db873cc09f7e57b258bcf65b8331d8b36b8c0 100644
@@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):
         webpage = self._download_webpage(url, title)
         title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
         config_url = self._html_search_regex(
-            r'data-src="(/contenu/medias/video.php.*?)"',
+            r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',
             webpage, 'config URL')
         config_url = 'http://www.jeuxvideo.com' + config_url
 
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
index 8e90d59868c380d1f486d838d8b6f3283bdc2b58..6770685d7027c3738fba35f3e057f6be2a3a512c 100644
@@ -7,33 +7,9 @@ from .common import InfoExtractor
 from ..utils import int_or_none
 
 
-class JWPlatformIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
-    _TEST = {
-        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
-        'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
-        'info_dict': {
-            'id': 'nPripu9l',
-            'ext': 'mov',
-            'title': 'Big Buck Bunny Trailer',
-            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
-            'upload_date': '20081127',
-            'timestamp': 1227796140,
-        }
-    }
-
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
-            webpage)
-        if mobj:
-            return mobj.group('url')
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
-        video_data = json_data['playlist'][0]
+class JWPlatformBaseIE(InfoExtractor):
+    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):
+        video_data = jwplayer_data['playlist'][0]
         subtitles = {}
         for track in video_data['tracks']:
             if track['kind'] == 'captions':
@@ -43,7 +19,7 @@ class JWPlatformIE(InfoExtractor):
         for source in video_data['sources']:
             source_url = self._proto_relative_url(source['file'])
             source_type = source.get('type') or ''
-            if source_type == 'application/vnd.apple.mpegurl':
+            if source_type in ('application/vnd.apple.mpegurl', 'hls'):
                 formats.extend(self._extract_m3u8_formats(
                     source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
             elif source_type.startswith('audio'):
@@ -61,10 +37,39 @@ class JWPlatformIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': video_data['title'],
+            'title': video_data['title'] if require_title else video_data.get('title'),
             'description': video_data.get('description'),
             'thumbnail': self._proto_relative_url(video_data.get('image')),
             'timestamp': int_or_none(video_data.get('pubdate')),
             'subtitles': subtitles,
             'formats': formats,
         }
+
+
+class JWPlatformIE(JWPlatformBaseIE):
+    _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+    _TEST = {
+        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
+        'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+        'info_dict': {
+            'id': 'nPripu9l',
+            'ext': 'mov',
+            'title': 'Big Buck Bunny Trailer',
+            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
+            'upload_date': '20081127',
+            'timestamp': 1227796140,
+        }
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+        return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index ccbc39c665412980e6b6104e83ffaf2e8574517f..44d7c84a13f9bef9aa1d68dc5d38fe81b0af4a5f 100644
@@ -8,6 +8,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse,
     compat_urlparse,
+    compat_parse_qs,
 )
 from ..utils import (
     clean_html,
@@ -20,21 +21,17 @@ from ..utils import (
 class KalturaIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                 (?:
-                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
                     https?://
                         (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
                         (?:
                             (?:
                                 # flash player
-                                index\.php/kwidget/
-                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
-                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+                                index\.php/kwidget|
                                 # html5 player
-                                html5/html5lib/
-                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
-                                .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+                                html5/html5lib/[^/]+/mwEmbedFrame\.php
                             )
-                        )
+                        )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
                 )
                 '''
     _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
@@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):
         url, smuggled_data = unsmuggle_url(url, {})
 
         mobj = re.match(self._VALID_URL, url)
-        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
-        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
-
-        info, flavor_assets = self._get_video_info(entry_id, partner_id)
+        partner_id, entry_id = mobj.group('partner_id', 'id')
+        ks = None
+        if partner_id and entry_id:
+            info, flavor_assets = self._get_video_info(entry_id, partner_id)
+        else:
+            path, query = mobj.group('path', 'query')
+            if not path and not query:
+                raise ExtractorError('Invalid URL', expected=True)
+            params = {}
+            if query:
+                params = compat_parse_qs(query)
+            if path:
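+                # Embed URLs carry parameters as alternating /key/value path
+                # segments (e.g. .../wid/_123/entry_id/abc); pair them up here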
+                split_path = path.split('/')
+                params.update(dict(zip(split_path[::2], [[v] for v in split_path[1::2]])))
+            if 'wid' in params:
+                partner_id = params['wid'][0][1:]
+            elif 'p' in params:
+                partner_id = params['p'][0]
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            if 'entry_id' in params:
+                entry_id = params['entry_id'][0]
+                info, flavor_assets = self._get_video_info(entry_id, partner_id)
+            elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+                reference_id = params['flashvars[referenceId]'][0]
+                webpage = self._download_webpage(url, reference_id)
+                entry_data = self._parse_json(self._search_regex(
+                    r'window\.kalturaIframePackageData\s*=\s*({.*});',
+                    webpage, 'kalturaIframePackageData'),
+                    reference_id)['entryResult']
+                info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+                entry_id = info['id']
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            ks = params.get('flashvars[ks]', [None])[0]
 
         source_url = smuggled_data.get('source_url')
         if source_url:
@@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):
         else:
             referrer = None
 
+        def sign_url(unsigned_url):
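+            # Propagate the Kaltura session token (ks) and the referrer, when
+            # available, onto the media URL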
+            if ks:
+                unsigned_url += '/ks/%s' % ks
+            if referrer:
+                unsigned_url += '?referrer=%s' % referrer
+            return unsigned_url
+
         formats = []
         for f in flavor_assets:
             # Continue if asset is not ready
             if f['status'] != 2:
                 continue
-            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
-            if referrer:
-                video_url += '?referrer=%s' % referrer
+            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
             formats.append({
                 'format_id': '%(fileExt)s-%(bitrate)s' % f,
                 'ext': f.get('fileExt'),
@@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):
                 'width': int_or_none(f.get('width')),
                 'url': video_url,
             })
-        m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp')
-        if referrer:
-            m3u8_url += '?referrer=%s' % referrer
+        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
         formats.extend(self._extract_m3u8_formats(
             m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
 
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 08a671fa86a007d3327ef03c257f1b943bd425db..61739efa7a4c3b84892083eab10237c23eb69e3d 100644
@@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.khanacademy.org/video/one-time-pad',
-        'md5': '7021db7f2d47d4fff89b13177cb1e8f4',
+        'md5': '7b391cce85e758fb94f763ddc1bbb979',
         'info_dict': {
             'id': 'one-time-pad',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'The one-time pad',
             'description': 'The perfect cipher',
             'duration': 176,
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644
index 0000000..931f34c
--- /dev/null
+++ b/youtube_dl/extractor/kusi.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    timeconvert,
+    update_url_query,
+    xpath_text,
+)
+
+
+class KUSIIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+    _TESTS = [{
+        'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold',
+        'md5': 'f926e7684294cf8cb7bdf8858e1b3988',
+        'info_dict': {
+            'id': '12203019',
+            'ext': 'mp4',
+            'title': 'Turko Files: Case Closed! & Put On Hold!',
+            'duration': 231.0,
+            'upload_date': '20160210',
+            'timestamp': 1455087571,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+    }, {
+        'url': 'http://kusi.com/video?clipId=12203019',
+        'info_dict': {
+            'id': '12203019',
+            'ext': 'mp4',
+            'title': 'Turko Files: Case Closed! & Put On Hold!',
+            'duration': 231.0,
+            'upload_date': '20160210',
+            'timestamp': 1455087571,
+            'thumbnail': 're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # Same as previous one
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        clip_id = mobj.group('clipId')
+        video_id = clip_id or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        if clip_id is None:
+            video_id = clip_id = self._html_search_regex(
+                r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
+
+        affiliate_id = self._search_regex(
+            r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
+
+        # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
+        xml_url = update_url_query('http://www.kusi.com/build.asp', {
+            'buildtype': 'buildfeaturexmlrequest',
+            'featureType': 'Clip',
+            'featureid': clip_id,
+            'affiliateno': affiliate_id,
+            'clientgroupid': '1',
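+            # presumably a cache-buster; the player sends a random number here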
+            'rnd': int(round(random.random() * 1000000)),
+        })
+
+        doc = self._download_xml(xml_url, video_id)
+
+        video_title = xpath_text(doc, 'HEADLINE', fatal=True)
+        duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
+        description = xpath_text(doc, 'ABSTRACT')
+        thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
+        creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+
+        quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
+        formats = []
+        for quality in quality_options:
+            formats.append({
+                'url': compat_urllib_parse_unquote_plus(quality.attrib['url']),
+                'height': int_or_none(quality.attrib.get('height')),
+                'width': int_or_none(quality.attrib.get('width')),
+                'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': description,
+            'duration': duration,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'timestamp': creation_time,
+        }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index f641edef8ada91cf1a4b458f388f7f51ce56fb3f..700e44b639a216d63921f607d980a53c3dc4f7cd 100644
@@ -68,6 +68,7 @@ class KuwoIE(KuwoBaseIE):
             'id': '6446136',
             'ext': 'mp3',
             'title': '心',
+            'description': 'md5:b2ab6295d014005bfc607525bfc1e38a',
             'creator': 'IU',
             'upload_date': '20150518',
         },
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
new file mode 100644
index 0000000..df47e88
--- /dev/null
+++ b/youtube_dl/extractor/leeco.py
@@ -0,0 +1,361 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import datetime
+import hashlib
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_ord,
+    compat_str,
+    compat_urllib_parse,
+)
+from ..utils import (
+    determine_ext,
+    encode_data_uri,
+    ExtractorError,
+    int_or_none,
+    orderedSet,
+    parse_iso8601,
+    sanitized_Request,
+    str_or_none,
+    url_basename,
+)
+
+
+class LeIE(InfoExtractor):
+    IE_DESC = '乐视网'
+    _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html'
+
+    _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/ptv/vplay/22005890.html',
+        'md5': 'edadcfe5406976f42f9f266057ee5e40',
+        'info_dict': {
+            'id': '22005890',
+            'ext': 'mp4',
+            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
+            'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'url': 'http://www.le.com/ptv/vplay/1415246.html',
+        'info_dict': {
+            'id': '1415246',
+            'ext': 'mp4',
+            'title': '美人天下01',
+            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'note': 'This video is available only in Mainland China, thus a proxy is needed',
+        'url': 'http://www.le.com/ptv/vplay/1118082.html',
+        'md5': '2424c74948a62e5f31988438979c5ad1',
+        'info_dict': {
+            'id': '1118082',
+            'ext': 'mp4',
+            'title': '与龙共舞 完整版',
+            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+        'skip': 'Only available in China',
+    }]
+
+    @staticmethod
+    def urshift(val, n):
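+        # Emulate ActionScript's unsigned right shift (>>>) on 32-bit values;
+        # Python's >> keeps the sign of negative numbers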
+        return val >> n if val >= 0 else (val + 0x100000000) >> n
+
+    # ror() and calc_time_key() are reversed from an embedded SWF file in KLetvPlayer.swf
+    def ror(self, param1, param2):
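+        # 32-bit rotate right: the low bit shifted out is reinserted as the
+        # high bit, repeated param2 times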
+        _loc3_ = 0
+        while _loc3_ < param2:
+            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
+            _loc3_ += 1
+        return param1
+
+    def calc_time_key(self, param1):
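+        # Derives the obfuscated 'tkey' request parameter: rotate by k % 13,
+        # XOR with k, rotate again by k % 17 (k = 773625421)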
+        _loc2_ = 773625421
+        _loc3_ = self.ror(param1, _loc2_ % 13)
+        _loc3_ = _loc3_ ^ _loc2_
+        _loc3_ = self.ror(_loc3_, _loc2_ % 17)
+        return _loc3_
+
+    # see M3U8Encryption class in KLetvPlayer.swf
+    @staticmethod
+    def decrypt_m3u8(encrypted_data):
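+        # Scrambled playlists are prefixed with 'vc_01': split each byte into
+        # two nibbles, rotate the nibble array right by 11, then recombine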
+        if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
+            return encrypted_data
+        encrypted_data = encrypted_data[5:]
+
+        _loc4_ = bytearray(2 * len(encrypted_data))
+        for idx, val in enumerate(encrypted_data):
+            b = compat_ord(val)
+            _loc4_[2 * idx] = b // 16
+            _loc4_[2 * idx + 1] = b % 16
+        idx = len(_loc4_) - 11
+        _loc4_ = _loc4_[idx:] + _loc4_[:idx]
+        _loc7_ = bytearray(len(encrypted_data))
+        for i in range(len(encrypted_data)):
+            _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
+
+        return bytes(_loc7_)
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
+        params = {
+            'id': media_id,
+            'platid': 1,
+            'splatid': 101,
+            'format': 1,
+            'tkey': self.calc_time_key(int(time.time())),
+            'domain': 'www.le.com'
+        }
+        play_json_req = sanitized_Request(
+            'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
+        )
+        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+        if cn_verification_proxy:
+            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
+
+        play_json = self._download_json(
+            play_json_req,
+            media_id, 'Downloading playJson data')
+
+        # Check for errors
+        playstatus = play_json['playstatus']
+        if playstatus['status'] == 0:
+            flag = playstatus['flag']
+            if flag == 1:
+                msg = 'Country %s auth error' % playstatus['country']
+            else:
+                msg = 'Generic error. flag = %d' % flag
+            raise ExtractorError(msg, expected=True)
+
+        playurl = play_json['playurl']
+
+        formats = ['350', '1000', '1300', '720p', '1080p']
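+        # numeric rate ids look like bitrates; 720p/1080p carry the height used below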
+        dispatch = playurl['dispatch']
+
+        urls = []
+        for format_id in formats:
+            if format_id in dispatch:
+                media_url = playurl['domain'][0] + dispatch[format_id][0]
+                media_url += '&' + compat_urllib_parse.urlencode({
+                    'm3v': 1,
+                    'format': 1,
+                    'expect': 3,
+                    'rateid': format_id,
+                })
+
+                nodes_data = self._download_json(
+                    media_url, media_id,
+                    'Downloading JSON metadata for format %s' % format_id)
+
+                req = self._request_webpage(
+                    nodes_data['nodelist'][0]['location'], media_id,
+                    note='Downloading m3u8 information for format %s' % format_id)
+
+                m3u8_data = self.decrypt_m3u8(req.read())
+
+                url_info_dict = {
+                    'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+                    'ext': determine_ext(dispatch[format_id][1]),
+                    'format_id': format_id,
+                    'protocol': 'm3u8',
+                }
+
+                if format_id[-1:] == 'p':
+                    url_info_dict['height'] = int_or_none(format_id[:-1])
+
+                urls.append(url_info_dict)
+
+        publish_time = parse_iso8601(self._html_search_regex(
+            r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
+            delimiter=' ', timezone=datetime.timedelta(hours=8))
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return {
+            'id': media_id,
+            'formats': urls,
+            'title': playurl['title'],
+            'thumbnail': playurl['pic'],
+            'description': description,
+            'timestamp': publish_time,
+        }
+
+
+class LePlaylistIE(InfoExtractor):
+    _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/tv/46177.html',
+        'info_dict': {
+            'id': '46177',
+            'title': '美人天下',
+            'description': 'md5:395666ff41b44080396e59570dbac01c'
+        },
+        'playlist_count': 35
+    }, {
+        'url': 'http://tv.le.com/izt/wuzetian/index.html',
+        'info_dict': {
+            'id': 'wuzetian',
+            'title': '武媚娘传奇',
+            'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+        },
+        # This playlist contains some extra videos other than the drama itself
+        'playlist_mincount': 96
+    }, {
+        'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+        # This series is moved to http://www.le.com/tv/10005297.html
+        'only_matching': True,
+    }, {
+        'url': 'http://www.le.com/comic/92063.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        page = self._download_webpage(url, playlist_id)
+
+        # Currently old domain names are still used in playlists
+        media_ids = orderedSet(re.findall(
+            r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+        entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+                   for media_id in media_ids]
+
+        title = self._html_search_meta('keywords', page,
+                                       fatal=False).split(',')[0]
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return self.playlist_result(entries, playlist_id, playlist_title=title,
+                                    playlist_description=description)
+
+
+class LetvCloudIE(InfoExtractor):
+    # Most of *.letv.com is changed to *.le.com on 2016/01/02
+    # but yuntv.letv.com is kept, so also keep the extractor name
+    IE_DESC = '乐视云'
+    _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
+
+    _TESTS = [{
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
+        'md5': '26450599afd64c513bc77030ad15db44',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_467623dedf',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_467623dedf',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
+        'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_ec93197892',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_ec93197892',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
+        'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_187060b6fd',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_187060b6fd',
+        },
+    }]
+
+    @staticmethod
+    def sign_data(obj):
+        if obj['cf'] == 'flash':
+            salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
+            items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
+        elif obj['cf'] == 'html5':
+            salt = 'fbeh5player12c43eccf2bec3300344'
+            items = ['cf', 'ran', 'uu', 'bver', 'vu']
+        input_data = ''.join([item + obj[item] for item in items]) + salt
+        obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
+
+    def _get_formats(self, cf, uu, vu, media_id):
+        def get_play_json(cf, timestamp):
+            data = {
+                'cf': cf,
+                'ver': '2.2',
+                'bver': 'firefox44.0',
+                'format': 'json',
+                'uu': uu,
+                'vu': vu,
+                'ran': compat_str(timestamp),
+            }
+            self.sign_data(data)
+            return self._download_json(
+                'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data),
+                media_id, 'Downloading playJson data for type %s' % cf)
+
+        play_json = get_play_json(cf, time.time())
+        # The server time may be different from local time
+        if play_json.get('code') == 10071:
+            play_json = get_play_json(cf, play_json['timestamp'])
+
+        if not play_json.get('data'):
+            if play_json.get('message'):
+                raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
+            elif play_json.get('code'):
+                raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
+            else:
+                raise ExtractorError('Letv cloud returned an unknown error')
+
+        def b64decode(s):
+            return base64.b64decode(s.encode('utf-8')).decode('utf-8')
+
+        formats = []
+        for media in play_json['data']['video_info']['media'].values():
+            play_url = media['play_url']
+            url = b64decode(play_url['main_url'])
+            decoded_url = b64decode(url_basename(url))
+            formats.append({
+                'url': url,
+                'ext': determine_ext(decoded_url),
+                'format_id': str_or_none(play_url.get('vtype')),
+                'format_note': str_or_none(play_url.get('definition')),
+                'width': int_or_none(play_url.get('vwidth')),
+                'height': int_or_none(play_url.get('vheight')),
+            })
+
+        return formats
+
+    def _real_extract(self, url):
+        uu_mobj = re.search(r'uu=([\w]+)', url)
+        vu_mobj = re.search(r'vu=([\w]+)', url)
+
+        if not uu_mobj or not vu_mobj:
+            raise ExtractorError('Invalid URL: %s' % url, expected=True)
+
+        uu = uu_mobj.group(1)
+        vu = vu_mobj.group(1)
+        media_id = uu + '_' + vu
+
+        formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': 'Video %s' % media_id,
+            'formats': formats,
+        }
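
The 'vc_01' scheme above is just a nibble shuffle: each byte is split into high/low nibbles, the nibble array is rotated right by 11, and the pairs are re-packed. A minimal Python 3 round-trip check (encrypt_m3u8() is a hypothetical inverse written here only to sanity-check decrypt_m3u8(); it is not part of the extractor):

    def encrypt_m3u8(plain_data):
        # split each byte into (high, low) nibbles
        nibbles = bytearray()
        for b in plain_data:
            nibbles += bytes((b // 16, b % 16))
        # decrypt_m3u8() rotates the nibble array right by 11, so rotate left here
        nibbles = nibbles[11:] + nibbles[:11]
        # re-pack nibble pairs and prepend the magic header
        out = bytearray(b'vc_01')
        for i in range(0, len(nibbles), 2):
            out.append(nibbles[i] * 16 + nibbles[i + 1])
        return bytes(out)

    data = b'#EXTM3U\n#EXT-X-VERSION:3\n'
    assert LeIE.decrypt_m3u8(encrypt_m3u8(data)) == data
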
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py
deleted file mode 100644 (file)
index 9665ece..0000000
--- a/youtube_dl/extractor/letv.py
+++ /dev/null
@@ -1,356 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import datetime
-import re
-import time
-import base64
-import hashlib
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_ord,
-    compat_str,
-)
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-    parse_iso8601,
-    sanitized_Request,
-    int_or_none,
-    str_or_none,
-    encode_data_uri,
-    url_basename,
-)
-
-
-class LetvIE(InfoExtractor):
-    IE_DESC = '乐视网'
-    _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
-
-    _TESTS = [{
-        'url': 'http://www.letv.com/ptv/vplay/22005890.html',
-        'md5': 'edadcfe5406976f42f9f266057ee5e40',
-        'info_dict': {
-            'id': '22005890',
-            'ext': 'mp4',
-            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
-            'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
-        },
-        'params': {
-            'hls_prefer_native': True,
-        },
-    }, {
-        'url': 'http://www.letv.com/ptv/vplay/1415246.html',
-        'info_dict': {
-            'id': '1415246',
-            'ext': 'mp4',
-            'title': '美人天下01',
-            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
-        },
-        'params': {
-            'hls_prefer_native': True,
-        },
-    }, {
-        'note': 'This video is available only in Mainland China, thus a proxy is needed',
-        'url': 'http://www.letv.com/ptv/vplay/1118082.html',
-        'md5': '2424c74948a62e5f31988438979c5ad1',
-        'info_dict': {
-            'id': '1118082',
-            'ext': 'mp4',
-            'title': '与龙共舞 完整版',
-            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
-        },
-        'params': {
-            'hls_prefer_native': True,
-        },
-        'skip': 'Only available in China',
-    }]
-
-    @staticmethod
-    def urshift(val, n):
-        return val >> n if val >= 0 else (val + 0x100000000) >> n
-
-    # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
-    def ror(self, param1, param2):
-        _loc3_ = 0
-        while _loc3_ < param2:
-            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
-            _loc3_ += 1
-        return param1
-
-    def calc_time_key(self, param1):
-        _loc2_ = 773625421
-        _loc3_ = self.ror(param1, _loc2_ % 13)
-        _loc3_ = _loc3_ ^ _loc2_
-        _loc3_ = self.ror(_loc3_, _loc2_ % 17)
-        return _loc3_
-
-    # see M3U8Encryption class in KLetvPlayer.swf
-    @staticmethod
-    def decrypt_m3u8(encrypted_data):
-        if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
-            return encrypted_data
-        encrypted_data = encrypted_data[5:]
-
-        _loc4_ = bytearray()
-        while encrypted_data:
-            b = compat_ord(encrypted_data[0])
-            _loc4_.extend([b // 16, b & 0x0f])
-            encrypted_data = encrypted_data[1:]
-        idx = len(_loc4_) - 11
-        _loc4_ = _loc4_[idx:] + _loc4_[:idx]
-        _loc7_ = bytearray()
-        while _loc4_:
-            _loc7_.append(_loc4_[0] * 16 + _loc4_[1])
-            _loc4_ = _loc4_[2:]
-
-        return bytes(_loc7_)
-
-    def _real_extract(self, url):
-        media_id = self._match_id(url)
-        page = self._download_webpage(url, media_id)
-        params = {
-            'id': media_id,
-            'platid': 1,
-            'splatid': 101,
-            'format': 1,
-            'tkey': self.calc_time_key(int(time.time())),
-            'domain': 'www.letv.com'
-        }
-        play_json_req = sanitized_Request(
-            'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
-        )
-        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-        if cn_verification_proxy:
-            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-        play_json = self._download_json(
-            play_json_req,
-            media_id, 'Downloading playJson data')
-
-        # Check for errors
-        playstatus = play_json['playstatus']
-        if playstatus['status'] == 0:
-            flag = playstatus['flag']
-            if flag == 1:
-                msg = 'Country %s auth error' % playstatus['country']
-            else:
-                msg = 'Generic error. flag = %d' % flag
-            raise ExtractorError(msg, expected=True)
-
-        playurl = play_json['playurl']
-
-        formats = ['350', '1000', '1300', '720p', '1080p']
-        dispatch = playurl['dispatch']
-
-        urls = []
-        for format_id in formats:
-            if format_id in dispatch:
-                media_url = playurl['domain'][0] + dispatch[format_id][0]
-                media_url += '&' + compat_urllib_parse.urlencode({
-                    'm3v': 1,
-                    'format': 1,
-                    'expect': 3,
-                    'rateid': format_id,
-                })
-
-                nodes_data = self._download_json(
-                    media_url, media_id,
-                    'Download JSON metadata for format %s' % format_id)
-
-                req = self._request_webpage(
-                    nodes_data['nodelist'][0]['location'], media_id,
-                    note='Downloading m3u8 information for format %s' % format_id)
-
-                m3u8_data = self.decrypt_m3u8(req.read())
-
-                url_info_dict = {
-                    'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
-                    'ext': determine_ext(dispatch[format_id][1]),
-                    'format_id': format_id,
-                    'protocol': 'm3u8',
-                }
-
-                if format_id[-1:] == 'p':
-                    url_info_dict['height'] = int_or_none(format_id[:-1])
-
-                urls.append(url_info_dict)
-
-        publish_time = parse_iso8601(self._html_search_regex(
-            r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
-            delimiter=' ', timezone=datetime.timedelta(hours=8))
-        description = self._html_search_meta('description', page, fatal=False)
-
-        return {
-            'id': media_id,
-            'formats': urls,
-            'title': playurl['title'],
-            'thumbnail': playurl['pic'],
-            'description': description,
-            'timestamp': publish_time,
-        }
-
-
-class LetvTvIE(InfoExtractor):
-    _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html'
-    _TESTS = [{
-        'url': 'http://www.letv.com/tv/46177.html',
-        'info_dict': {
-            'id': '46177',
-            'title': '美人天下',
-            'description': 'md5:395666ff41b44080396e59570dbac01c'
-        },
-        'playlist_count': 35
-    }]
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        page = self._download_webpage(url, playlist_id)
-
-        media_urls = list(set(re.findall(
-            r'http://www.letv.com/ptv/vplay/\d+.html', page)))
-        entries = [self.url_result(media_url, ie='Letv')
-                   for media_url in media_urls]
-
-        title = self._html_search_meta('keywords', page,
-                                       fatal=False).split(',')[0]
-        description = self._html_search_meta('description', page, fatal=False)
-
-        return self.playlist_result(entries, playlist_id, playlist_title=title,
-                                    playlist_description=description)
-
-
-class LetvPlaylistIE(LetvTvIE):
-    _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html'
-    _TESTS = [{
-        'url': 'http://tv.letv.com/izt/wuzetian/index.html',
-        'info_dict': {
-            'id': 'wuzetian',
-            'title': '武媚娘传奇',
-            'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
-        },
-        # This playlist contains some extra videos other than the drama itself
-        'playlist_mincount': 96
-    }, {
-        'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml',
-        'info_dict': {
-            'id': 'lswjzzjc',
-            # The title should be "劲舞青春", but I can't find a simple way to
-            # determine the playlist title
-            'title': '乐视午间自制剧场',
-            'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489'
-        },
-        'playlist_mincount': 7
-    }]
-
-
-class LetvCloudIE(InfoExtractor):
-    IE_DESC = '乐视云'
-    _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
-
-    _TESTS = [{
-        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
-        'md5': '26450599afd64c513bc77030ad15db44',
-        'info_dict': {
-            'id': 'p7jnfw5hw9_467623dedf',
-            'ext': 'mp4',
-            'title': 'Video p7jnfw5hw9_467623dedf',
-        },
-    }, {
-        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
-        'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
-        'info_dict': {
-            'id': 'p7jnfw5hw9_ec93197892',
-            'ext': 'mp4',
-            'title': 'Video p7jnfw5hw9_ec93197892',
-        },
-    }, {
-        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
-        'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
-        'info_dict': {
-            'id': 'p7jnfw5hw9_187060b6fd',
-            'ext': 'mp4',
-            'title': 'Video p7jnfw5hw9_187060b6fd',
-        },
-    }]
-
-    @staticmethod
-    def sign_data(obj):
-        if obj['cf'] == 'flash':
-            salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
-            items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
-        elif obj['cf'] == 'html5':
-            salt = 'fbeh5player12c43eccf2bec3300344'
-            items = ['cf', 'ran', 'uu', 'bver', 'vu']
-        input_data = ''.join([item + obj[item] for item in items]) + salt
-        obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
-
-    def _get_formats(self, cf, uu, vu, media_id):
-        def get_play_json(cf, timestamp):
-            data = {
-                'cf': cf,
-                'ver': '2.2',
-                'bver': 'firefox44.0',
-                'format': 'json',
-                'uu': uu,
-                'vu': vu,
-                'ran': compat_str(timestamp),
-            }
-            self.sign_data(data)
-            return self._download_json(
-                'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data),
-                media_id, 'Downloading playJson data for type %s' % cf)
-
-        play_json = get_play_json(cf, time.time())
-        # The server time may be different from local time
-        if play_json.get('code') == 10071:
-            play_json = get_play_json(cf, play_json['timestamp'])
-
-        if not play_json.get('data'):
-            if play_json.get('message'):
-                raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
-            elif play_json.get('code'):
-                raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
-            else:
-                raise ExtractorError('Letv cloud returned an unknwon error')
-
-        def b64decode(s):
-            return base64.b64decode(s.encode('utf-8')).decode('utf-8')
-
-        formats = []
-        for media in play_json['data']['video_info']['media'].values():
-            play_url = media['play_url']
-            url = b64decode(play_url['main_url'])
-            decoded_url = b64decode(url_basename(url))
-            formats.append({
-                'url': url,
-                'ext': determine_ext(decoded_url),
-                'format_id': int_or_none(play_url.get('vtype')),
-                'format_note': str_or_none(play_url.get('definition')),
-                'width': int_or_none(play_url.get('vwidth')),
-                'height': int_or_none(play_url.get('vheight')),
-            })
-
-        return formats
-
-    def _real_extract(self, url):
-        uu_mobj = re.search('uu=([\w]+)', url)
-        vu_mobj = re.search('vu=([\w]+)', url)
-
-        if not uu_mobj or not vu_mobj:
-            raise ExtractorError('Invalid URL: %s' % url, expected=True)
-
-        uu = uu_mobj.group(1)
-        vu = vu_mobj.group(1)
-        media_id = uu + '_' + vu
-
-        formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
-        self._sort_formats(formats)
-
-        return {
-            'id': media_id,
-            'title': 'Video %s' % media_id,
-            'formats': formats,
-        }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index f8cbca7b36afab1890b71806d6761bbe67d7d924..a8fd639cc9276c2290f9c89dbded1bb9a1cbe7dd 100644 (file)
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor):
     _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
 
     _TESTS = [{
-        'url': 'http://lifenews.ru/news/126342',
-        'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+        # single video embedded via video/source
+        'url': 'http://lifenews.ru/news/98736',
+        'md5': '77c95eaefaca216e32a76a343ad89d23',
         'info_dict': {
-            'id': '126342',
+            'id': '98736',
             'ext': 'mp4',
-            'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
-            'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
-            'thumbnail': 're:http://.*\.jpg',
-            'upload_date': '20140130',
+            'title': 'Мужчина нашел дома архив оборонного завода',
+            'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+            'upload_date': '20120805',
         }
     }, {
-        # video in <iframe>
+        # single video embedded via iframe
         'url': 'http://lifenews.ru/news/152125',
         'md5': '77d19a6f0886cd76bdbf44b4d971a273',
         'info_dict': {
@@ -42,15 +42,33 @@ class LifeNewsIE(InfoExtractor):
             'upload_date': '20150402',
         }
     }, {
+        # two videos embedded via iframe
         'url': 'http://lifenews.ru/news/153461',
-        'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
         'info_dict': {
             'id': '153461',
-            'ext': 'mp4',
             'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
             'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
             'upload_date': '20150505',
-        }
+        },
+        'playlist': [{
+            'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+            'info_dict': {
+                'id': '153461-video1',
+                'ext': 'mp4',
+                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
+                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'upload_date': '20150505',
+            },
+        }, {
+            'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322',
+            'info_dict': {
+                'id': '153461-video2',
+                'ext': 'mp4',
+                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
+                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'upload_date': '20150505',
+            },
+        }],
     }, {
         'url': 'http://lifenews.ru/video/13035',
         'only_matching': True,
@@ -65,10 +83,14 @@ class LifeNewsIE(InfoExtractor):
             'http://lifenews.ru/%s/%s' % (section, video_id),
             video_id, 'Downloading page')
 
-        videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
-        iframe_link = self._html_search_regex(
-            '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
-        if not videos and not iframe_link:
+        video_urls = re.findall(
+            r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
+
+        iframe_links = re.findall(
+            r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']',
+            webpage)
+
+        if not video_urls and not iframe_links:
             raise ExtractorError('No media links available for %s' % video_id)
 
         title = remove_end(
@@ -95,31 +117,44 @@ class LifeNewsIE(InfoExtractor):
             'upload_date': upload_date,
         }
 
-        def make_entry(video_id, media, video_number=None):
+        def make_entry(video_id, video_url, index=None):
             cur_info = dict(common_info)
             cur_info.update({
-                'id': video_id,
-                'url': media[1],
-                'thumbnail': media[0],
-                'title': title if video_number is None else '%s-video%s' % (title, video_number),
+                'id': video_id if not index else '%s-video%s' % (video_id, index),
+                'url': video_url,
+                'title': title if not index else '%s (Видео %s)' % (title, index),
             })
             return cur_info
 
-        if iframe_link:
-            iframe_link = self._proto_relative_url(iframe_link, 'http:')
-            cur_info = dict(common_info)
-            cur_info.update({
-                '_type': 'url_transparent',
-                'id': video_id,
-                'title': title,
-                'url': iframe_link,
-            })
+        def make_video_entry(video_id, video_url, index=None):
+            video_url = compat_urlparse.urljoin(url, video_url)
+            return make_entry(video_id, video_url, index)
+
+        def make_iframe_entry(video_id, video_url, index=None):
+            video_url = self._proto_relative_url(video_url, 'http:')
+            cur_info = make_entry(video_id, video_url, index)
+            cur_info['_type'] = 'url_transparent'
             return cur_info
 
-        if len(videos) == 1:
-            return make_entry(video_id, videos[0])
-        else:
-            return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+        if len(video_urls) == 1 and not iframe_links:
+            return make_video_entry(video_id, video_urls[0])
+
+        if len(iframe_links) == 1 and not video_urls:
+            return make_iframe_entry(video_id, iframe_links[0])
+
+        entries = []
+
+        if video_urls:
+            for num, video_url in enumerate(video_urls, 1):
+                entries.append(make_video_entry(video_id, video_url, num))
+
+        if iframe_links:
+            for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1):
+                entries.append(make_iframe_entry(video_id, iframe_link, num))
+
+        playlist = common_info.copy()
+        playlist.update(self.playlist_result(entries, video_id, title, description))
+        return playlist
 
 
 class LifeEmbedIE(InfoExtractor):
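
For reference, the iframe branch above returns url_transparent entries: extraction is delegated to LifeEmbedIE, but metadata set here (numbered title, upload date, counts) takes precedence over whatever the embed extractor finds. A minimal sketch of such an entry, with hypothetical values:

    entry = {
        '_type': 'url_transparent',  # delegate to another extractor,
        'url': 'http://embed.life.ru/embed/xyz',  # but keep this metadata
        'id': '153461-video2',
        'title': 'В Москве спасли потерявшегося медвежонка (Видео 2)',
        'upload_date': '20150505',
    }
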
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 38fb3d9e4166f5f4a188ab2436c27240a8b04283..eada7c299238953baa9fd3d8219b2754aa7f9356 100644 (file)
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -14,6 +14,7 @@ from ..utils import (
     xpath_with_ns,
     xpath_text,
     orderedSet,
+    update_url_query,
     int_or_none,
     float_or_none,
     parse_iso8601,
@@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor):
     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
         base_ele = find_xpath_attr(
             smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
-        base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/'
+        base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
 
         formats = []
         video_nodes = smil.findall(self._xpath_ns('.//video', namespace))
@@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor):
         for vn in video_nodes:
             tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)
             furl = (
-                '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src']))
+                update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), {
+                    'v': '3.0.3',
+                    'fp': 'WIN% 14,0,0,145',
+                }))
             if 'clipBegin' in vn.attrib:
                 furl += '&ssek=' + vn.attrib['clipBegin']
             formats.append({
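
update_url_query() (youtube_dl.utils) merges parameters into whatever query string the URL already carries instead of blindly appending after a '?', which is what makes the urljoin-based rewrite above safe for src values that are absolute or already parameterized. Roughly (exact parameter order may vary):

    from youtube_dl.utils import update_url_query

    update_url_query('http://example.com/a.mp4?foo=1', {'v': '3.0.3'})
    # -> 'http://example.com/a.mp4?foo=1&v=3.0.3'
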
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py
new file mode 100644 (file)
index 0000000..f5d00e6
--- /dev/null
+++ b/youtube_dl/extractor/makerschannel.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MakersChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
+        'md5': '624a512c6969236b5967bf9286345ad1',
+        'info_dict': {
+            'id': '849',
+            'ext': 'mp4',
+            'title': 'Landing a bus on a plane is an epic win',
+            'uploader': 'ZoomIn',
+            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+        }
+    }
+
+    def _real_extract(self, url):
+        id_type, url_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, url_id)
+        video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
+
+        def extract_data_val(attr, fatal=False):
+            return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
+        minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'minoto:%s' % minoto_id,
+            'id': extract_data_val('video-id', True),
+            'title': extract_data_val('title', True),
+            'description': extract_data_val('description'),
+            'thumbnail': extract_data_val('image'),
+            'uploader': extract_data_val('channel'),
+        }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 425fc9e2a69b93879eb71a32eae2a042d97770cc..2338e7f96f36bea7246e7357302cbcbcac39ad8a 100644 (file)
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -14,7 +14,7 @@ from ..utils import (
 
 class MDRIE(InfoExtractor):
     IE_DESC = 'MDR.DE and KiKA'
-    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
+    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P<id>\d+)(?:_.+?)?\.html'
 
     _TESTS = [{
         # MDR regularly deletes its videos
@@ -60,6 +60,9 @@ class MDRIE(InfoExtractor):
     }, {
         'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
         'only_matching': True,
+    }, {
+        'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -68,8 +71,8 @@ class MDRIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         data_url = self._search_regex(
-            r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
-            webpage, 'data url', group='url')
+            r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1',
+            webpage, 'data url', group='url').replace(r'\/', '/')
 
         doc = self._download_xml(
             compat_urlparse.urljoin(url, data_url), video_id)
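
The widened regex also accepts the playerXml variant, hyphenated ids and JSON-escaped slashes; a quick check against a hypothetical page snippet:

    import re

    rx = (r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])'
          r'(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1')
    page = 'playerXml: "\\/mediathek\\/mdr-videos\\/a\\/video-1334-avCustom.xml"'
    print(re.search(rx, page).group('url').replace(r'\/', '/'))
    # /mediathek/mdr-videos/a/video-1334-avCustom.xml
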
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
new file mode 100644 (file)
index 0000000..959a105
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MinotoIE(InfoExtractor):
+    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        player_id = mobj.group('player_id') or '1'
+        video_id = mobj.group('id')
+        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+        video_metadata = video_data['video-metadata']
+        formats = []
+        for fmt in video_data['video-files']:
+            fmt_url = fmt.get('url')
+            if not fmt_url:
+                continue
+            container = fmt.get('container')
+            if container == 'hls':
+                formats.extend(self._extract_m3u8_formats(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                fmt_profile = fmt.get('profile') or {}
+                f = {
+                    'format_id': fmt_profile.get('name-short'),
+                    'format_note': fmt_profile.get('name'),
+                    'url': fmt_url,
+                    'container': container,
+                    'tbr': int_or_none(fmt.get('bitrate')),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                }
+                codecs = fmt.get('codecs')
+                if codecs:
+                    codecs = codecs.split(',')
+                    if len(codecs) == 2:
+                        f.update({
+                            'vcodec': codecs[0],
+                            'acodec': codecs[1],
+                        })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+            'formats': formats,
+        }
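
MinotoIE accepts both public player URLs and the bare minoto:<id> form that MakersChannelIE above hands off through its url_transparent result; with the scheme form, player_id is absent and _real_extract() falls back to '1'. For example ('339' is a made-up player id):

    import re

    _VALID_URL = (r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/'
                  r'(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)')

    m = re.match(_VALID_URL, 'minoto:abc123XYZ')
    print(m.group('player_id'), m.group('id'))  # None abc123XYZ

    m = re.match(_VALID_URL, 'http://play.minoto-video.com/339/abc123XYZ')
    print(m.group('player_id'), m.group('id'))  # 339 abc123XYZ
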
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 29ca45778a17654c4d2125ceda177b71cffca8a8..819c1b90bb755c873b3f7f1b64e07dc97126a9b9 100644 (file)
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor):
             'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
             'info_dict': {
                 'id': 'EObHWIEKGjA',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
                 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
                 'upload_date': '20121109',
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index c2b7ed9abbd27a2b2c8e0d9d95c59e387630180a..101497118275b7f1b5bf0564048f1dc9fc4b878b 100644 (file)
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     ExtractorError,
     HEADRequest,
+    parse_count,
     str_to_int,
 )
 
@@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):
         uploader_id = self._search_regex(
             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
         description = self._og_search_description(webpage)
-        like_count = str_to_int(self._search_regex(
-            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
+        like_count = parse_count(self._search_regex(
+            r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
             webpage, 'like count', fatal=False))
         view_count = str_to_int(self._search_regex(
             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
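
str_to_int() only strips digit separators, so it cannot parse the abbreviated counts ('5.5k') Mixcloud now renders in the toggle-number span; parse_count() (youtube_dl.utils) handles both spellings, roughly:

    from youtube_dl.utils import parse_count

    parse_count('1,234')  # 1234
    parse_count('5.5k')   # 5500
    parse_count(None)     # None, so like_count stays optional
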
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 97d5da626a7a5d2555ac3107eb89d1a4fd11b510..0b4787c1d23daa2ed8ed4065df9baa9ab706521b 100644 (file)
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,6 +5,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     str_to_int,
     unified_strdate,
 )
@@ -12,55 +13,62 @@ from ..utils import (
 
 class MotherlessIE(InfoExtractor):
     _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
-    _TESTS = [
-        {
-            'url': 'http://motherless.com/AC3FFE1',
-            'md5': '310f62e325a9fafe64f68c0bccb6e75f',
-            'info_dict': {
-                'id': 'AC3FFE1',
-                'ext': 'mp4',
-                'title': 'Fucked in the ass while playing PS3',
-                'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
-                'upload_date': '20100913',
-                'uploader_id': 'famouslyfuckedup',
-                'thumbnail': 're:http://.*\.jpg',
-                'age_limit': 18,
-            }
-        },
-        {
-            'url': 'http://motherless.com/532291B',
-            'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
-            'info_dict': {
-                'id': '532291B',
-                'ext': 'mp4',
-                'title': 'Amazing girl playing the omegle game, PERFECT!',
-                'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'],
-                'upload_date': '20140622',
-                'uploader_id': 'Sulivana7x',
-                'thumbnail': 're:http://.*\.jpg',
-                'age_limit': 18,
-            }
+    _TESTS = [{
+        'url': 'http://motherless.com/AC3FFE1',
+        'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+        'info_dict': {
+            'id': 'AC3FFE1',
+            'ext': 'mp4',
+            'title': 'Fucked in the ass while playing PS3',
+            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+            'upload_date': '20100913',
+            'uploader_id': 'famouslyfuckedup',
+            'thumbnail': 're:http://.*\.jpg',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://motherless.com/532291B',
+        'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+        'info_dict': {
+            'id': '532291B',
+            'ext': 'mp4',
+            'title': 'Amazing girl playing the omegle game, PERFECT!',
+            'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
+                           'game', 'hairy'],
+            'upload_date': '20140622',
+            'uploader_id': 'Sulivana7x',
+            'thumbnail': 're:http://.*\.jpg',
+            'age_limit': 18,
         },
-        {
-            'url': 'http://motherless.com/g/cosplay/633979F',
-            'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
-            'info_dict': {
-                'id': '633979F',
-                'ext': 'mp4',
-                'title': 'Turtlette',
-                'categories': ['superheroine heroine  superher'],
-                'upload_date': '20140827',
-                'uploader_id': 'shade0230',
-                'thumbnail': 're:http://.*\.jpg',
-                'age_limit': 18,
-            }
+        'skip': '404',
+    }, {
+        'url': 'http://motherless.com/g/cosplay/633979F',
+        'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+        'info_dict': {
+            'id': '633979F',
+            'ext': 'mp4',
+            'title': 'Turtlette',
+            'categories': ['superheroine heroine  superher'],
+            'upload_date': '20140827',
+            'uploader_id': 'shade0230',
+            'thumbnail': 're:http://.*\.jpg',
+            'age_limit': 18,
         }
-    ]
+    }, {
+        # no keywords
+        'url': 'http://motherless.com/8B4BBC1',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
+        if any(p in webpage for p in (
+                '<title>404 - MOTHERLESS.COM<',
+                ">The page you're looking for cannot be found.<")):
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
         title = self._html_search_regex(
             r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
         video_url = self._html_search_regex(
@@ -86,7 +94,7 @@ class MotherlessIE(InfoExtractor):
             r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
             webpage, 'uploader_id')
 
-        categories = self._html_search_meta('keywords', webpage)
+        categories = self._html_search_meta('keywords', webpage, default=None)
         if categories:
             categories = [cat.strip() for cat in categories.split(',')]
 
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index e8bb527b89f443fb433694ac9e195fbbf5d2476e..ed068365d3d4936d35e2e62146f1dcbec750728b 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -11,6 +11,7 @@ from ..utils import (
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
+    float_or_none,
     HEADRequest,
     sanitized_Request,
     unescapeHTML,
@@ -110,7 +111,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
         uri = itemdoc.find('guid').text
         video_id = self._id_from_uri(uri)
         self.report_extraction(video_id)
-        mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+        content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
+        mediagen_url = content_el.attrib['url']
         # Remove the templates, like &device={device}
         mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
         if 'acceptMethods' not in mediagen_url:
@@ -165,6 +167,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
             'id': video_id,
             'thumbnail': self._get_thumbnail_url(uri, itemdoc),
             'description': description,
+            'duration': float_or_none(content_el.attrib.get('duration')),
         }
 
     def _get_feed_query(self, uri):
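
float_or_none() keeps the new duration field optional: XML attributes come back as strings, and a missing attribute yields None instead of raising. For example:

    from youtube_dl.utils import float_or_none

    float_or_none('24.41')  # 24.41
    float_or_none(None)     # None -> 'duration' simply stays unset
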
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index a071378b6d1dc18cefe3d76f98c3b30d0fe8a880..3e2b3e59945f4a67275ca6bc9d3c9193f531506c 100644 (file)
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,18 +1,26 @@
 from __future__ import unicode_literals
 
+import functools
+import os.path
 import re
 
 from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urlparse,
+)
 from ..utils import (
-    parse_duration,
     int_or_none,
+    OnDemandPagedList,
+    parse_duration,
+    remove_start,
     xpath_text,
     xpath_attr,
 )
 
 
 class NBAIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+    _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
     _TESTS = [{
         'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
         'md5': '9e7729d3010a9c71506fd1248f74e4f4',
@@ -44,14 +52,101 @@ class NBAIE(InfoExtractor):
             'timestamp': 1432134543,
             'upload_date': '20150520',
         }
+    }, {
+        'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
+        'info_dict': {
+            'id': '1455672027478-Doc_Feb16_720',
+            'ext': 'mp4',
+            'title': 'Practice: Doc Rivers - 2/16/16',
+            'description': 'Head Coach Doc Rivers addresses the media following practice.',
+            'upload_date': '20160217',
+            'timestamp': 1455672000,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+        'info_dict': {
+            'id': 'timberwolves',
+            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+        },
+        'playlist_count': 30,
+        'params': {
+            # Downloading the whole playlist takes too long
+            'playlist_items': '1-30',
+        },
+    }, {
+        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+        'info_dict': {
+            'id': 'Wigginsmp4',
+            'ext': 'mp4',
+            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+            'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
+            'upload_date': '20141212',
+            'timestamp': 1418418600,
+        },
+        'params': {
+            'noplaylist': True,
+            # m3u8 download
+            'skip_download': True,
+        },
     }]
 
+    _PAGE_SIZE = 30
+
+    def _fetch_page(self, team, video_id, page):
+        search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse.urlencode({
+            'type': 'teamvideo',
+            'start': page * self._PAGE_SIZE + 1,
+            'npp': (page + 1) * self._PAGE_SIZE + 1,
+            'sort': 'recent',
+            'output': 'json',
+            'site': team,
+        })
+        results = self._download_json(
+            search_url, video_id, note='Downloading page %d of playlist data' % page)['results'][0]
+        for item in results:
+            yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
+
+    def _extract_playlist(self, orig_path, video_id, webpage):
+        team = orig_path.split('/')[0]
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen('Downloading just video because of --no-playlist')
+            video_path = self._search_regex(
+                r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
+            video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
+            return self.url_result(video_url)
+
+        self.to_screen('Downloading playlist - add --no-playlist to just download video')
+        playlist_title = self._og_search_title(webpage, fatal=False)
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, team, video_id),
+            self._PAGE_SIZE, use_cache=True)
+
+        return self.playlist_result(entries, team, playlist_title)
+
     def _real_extract(self, url):
         path, video_id = re.match(self._VALID_URL, url).groups()
+        orig_path = path
         if path.startswith('nba/'):
             path = path[3:]
+
+        if 'video/' not in path:
+            webpage = self._download_webpage(url, video_id)
+            path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+
+            if path == '{{id}}':
+                return self._extract_playlist(orig_path, video_id, webpage)
+
+            # See prepareContentId() of pkgCvp.js
+            if path.startswith('video/teams'):
+                path = 'video/channels/proxy/' + path[6:]
+
         video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id)
-        video_id = xpath_text(video_info, 'slug')
+        video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0]
         title = xpath_text(video_info, 'headline')
         description = xpath_text(video_info, 'description')
         duration = parse_duration(xpath_text(video_info, 'length'))
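
OnDemandPagedList only invokes _fetch_page() for the pages a requested slice actually touches, and use_cache=True memoizes them, so the 'playlist_items': '1-30' test above costs a single search request. A minimal sketch with a hypothetical page function:

    from youtube_dl.utils import OnDemandPagedList

    def fetch_page(page):  # hypothetical stand-in for NBAIE._fetch_page
        print('fetching page %d' % page)
        for i in range(30):
            yield {'id': '%d-%d' % (page, i)}

    entries = OnDemandPagedList(fetch_page, 30, use_cache=True)
    entries.getslice(0, 10)  # prints 'fetching page 0' once
    entries.getslice(0, 10)  # served from cache, no new fetch
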
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index a126f5054fbee3c178cf19915caec9fdb03df297..3b21fbd4d9e1442e3aa8016d74c1b9a97278d90f 100644 (file)
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_urlparse,
+    compat_urllib_parse_unquote,
+)
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -87,7 +90,7 @@ class NRKIE(InfoExtractor):
 
 
 class NRKPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
 
     _TESTS = [{
         'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -126,6 +129,37 @@ class NRKPlaylistIE(InfoExtractor):
             entries, playlist_id, playlist_title, playlist_description)
 
 
+class NRKSkoleIE(InfoExtractor):
+    IE_DESC = 'NRK Skole'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532',
+        'md5': '04cd85877cc1913bce73c5d28a47e00f',
+        'info_dict': {
+            'id': '6021',
+            'ext': 'flv',
+            'title': 'Genetikk og eneggede tvillinger',
+            'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
+            'duration': 399,
+        },
+    }, {
+        'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = compat_urllib_parse_unquote(self._match_id(url))
+
+        webpage = self._download_webpage(url, video_id)
+
+        nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
+        return self.url_result('nrk:%s' % nrk_id)
+
+
 class NRKTVIE(InfoExtractor):
     IE_DESC = 'NRK TV and NRK Radio'
     _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
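
The topic parameter may arrive percent-encoded (see the second NRKSkole test above), hence the unquote before the value is used as a display id:

    from youtube_dl.compat import compat_urllib_parse_unquote

    compat_urllib_parse_unquote('nrk%3Aklipp%2F616532')  # 'nrk:klipp/616532'
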
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index cca012953d01d173e638924b4992015625815291..f43e3a146e7bd35d9a99ab730289f4a1d4f5b91c 100644 (file)
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -337,6 +337,21 @@ class PBSIE(InfoExtractor):
                 'skip_download': True,  # requires ffmpeg
             },
         },
+        {
+            # Serves HD only via widget/partnerplayer page
+            'url': 'http://www.pbs.org/video/2365641075/',
+            'info_dict': {
+                'id': '2365641075',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - Netanyahu at War',
+                'duration': 6852,
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'formats': 'mincount:8',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
+        },
         {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
             'only_matching': True,
@@ -437,34 +452,54 @@ class PBSIE(InfoExtractor):
                 for vid_id in video_id]
             return self.playlist_result(entries, display_id)
 
+        info = None
+        redirects = []
+        redirect_urls = set()
+
+        def extract_redirect_urls(info):
+            for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+                redirect = info.get(encoding_name)
+                if not redirect:
+                    continue
+                redirect_url = redirect.get('url')
+                if redirect_url and redirect_url not in redirect_urls:
+                    redirects.append(redirect)
+                    redirect_urls.add(redirect_url)
+
         try:
-            info = self._download_json(
+            video_info = self._download_json(
                 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
                 display_id, 'Downloading video info JSON')
+            extract_redirect_urls(video_info)
+            info = video_info
         except ExtractorError as e:
+            # videoInfo API may not work for some videos
             if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404:
                 raise
-            # videoInfo API may not work for some videos, fallback to portalplayer API
+
+        # Player pages may also serve different qualities
+        for page in ('widget/partnerplayer', 'portalplayer'):
             player = self._download_webpage(
-                'http://player.pbs.org/portalplayer/%s' % video_id, display_id)
-            info = self._parse_json(
-                self._search_regex(
-                    r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
-                    player, 'video data', default='{}'),
-                display_id, transform_source=js_to_json, fatal=False)
+                'http://player.pbs.org/%s/%s' % (page, video_id),
+                display_id, 'Downloading %s page' % page, fatal=False)
+            if player:
+                video_info = self._parse_json(
+                    self._search_regex(
+                        r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+                        player, '%s video data' % page, default='{}'),
+                    display_id, transform_source=js_to_json, fatal=False)
+                if video_info:
+                    extract_redirect_urls(video_info)
+                    if not info:
+                        info = video_info
 
         formats = []
-        for encoding_name in ('recommended_encoding', 'alternate_encoding'):
-            redirect = info.get(encoding_name)
-            if not redirect:
-                continue
-            redirect_url = redirect.get('url')
-            if not redirect_url:
-                continue
+        for num, redirect in enumerate(redirects):
+            redirect_id = redirect.get('eeid')
 
             redirect_info = self._download_json(
-                redirect_url + '?format=json', display_id,
-                'Downloading %s video url info' % encoding_name)
+                '%s?format=json' % redirect['url'], display_id,
+                'Downloading %s video url info' % (redirect_id or num))
 
             if redirect_info['status'] == 'error':
                 raise ExtractorError(
@@ -483,8 +518,9 @@ class PBSIE(InfoExtractor):
             else:
                 formats.append({
                     'url': format_url,
-                    'format_id': redirect.get('eeid'),
+                    'format_id': redirect_id,
                 })
+        self._remove_duplicate_formats(formats)
         self._sort_formats(formats)
 
         rating_str = info.get('rating')
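
Since videoInfo and both player pages can all report the same encodings, redirects are de-duplicated by URL as they are collected, and _remove_duplicate_formats() drops whatever identical URLs still slip through. The latter amounts to a seen-set filter; a sketch of the idea, not necessarily the exact helper:

    def remove_duplicate_formats(formats):
        seen, unique = set(), []
        for f in formats:
            if f['url'] not in seen:
                seen.add(f['url'])
                unique.append(f)
        formats[:] = unique  # in place, matching how the helper is called
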
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 6d5732d45c3d3e22d085319ff45449881ac73ad2..30a5f2de4475a934cfa467764d0ce559d3e68a74 100644 (file)
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -12,14 +12,14 @@ class PyvideoIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
-            'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+            'md5': '520915673e53a5c5d487c36e0c4d85b5',
             'info_dict': {
                 'id': '24_4WWkSmNo',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'title': 'Become a logging expert in 30 minutes',
                 'description': 'md5:9665350d466c67fb5b1598de379021f7',
                 'upload_date': '20130320',
-                'uploader': 'NextDayVideo',
+                'uploader': 'Next Day Video',
                 'uploader_id': 'NextDayVideo',
             },
             'add_ie': ['Youtube'],
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
index b1b8800b97c9eb8caad2c03f999f1bc8f304c4da..99979ebe1a9fe82099076b46b576ef38a58bca8c 100644 (file)
--- a/youtube_dl/extractor/revision3.py
+++ b/youtube_dl/extractor/revision3.py
@@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor):
         'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
         'md5': 'd94a72d85d0a829766de4deb8daaf7df',
         'info_dict': {
-            'id': '73034',
+            'id': '71089',
             'display_id': 'technobuffalo/5-google-predictions-for-2016',
             'ext': 'webm',
             'title': '5 Google Predictions for 2016',
@@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor):
             'uploader_id': 'technobuffalo',
         }
     }, {
+        # Show
         'url': 'http://testtube.com/brainstuff',
         'info_dict': {
             'id': '251',
@@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor):
     }, {
         'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
         'info_dict': {
-            'id': '60163',
+            'id': '58227',
             'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
             'duration': 275,
             'ext': 'webm',
@@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor):
             'uploader': 'DNews',
             'uploader_id': 'dnews',
         },
+    }, {
+        'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+        'info_dict': {
+            'id': '71618',
+            'ext': 'mp4',
+            'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
+            'title': 'The Israel-Palestine Conflict Explained in Ten Minutes',
+            'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start',
+            'uploader': 'Editors\' Picks',
+            'uploader_id': 'tt-editors-picks',
+            'timestamp': 1453309200,
+            'upload_date': '20160120',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        # Tag
+        'url': 'http://testtube.com/tech-news',
+        'info_dict': {
+            'id': '21018',
+            'title': 'tech news',
+        },
+        'playlist_mincount': 9,
     }]
     _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
     _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
 
     def _real_extract(self, url):
         domain, display_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[0]
         page_info = self._download_json(
             self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
 
-        if page_info['data']['type'] == 'episode':
-            episode_data = page_info['data']
-            video_id = compat_str(episode_data['video']['data']['id'])
+        page_data = page_info['data']
+        page_type = page_data['type']
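+        # 'episode' and 'embed' pages carry the video data directly; any other
+        # page type (show, tag, ...) is handled as a playlist in the else branch.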
+        if page_type in ('episode', 'embed'):
+            show_data = page_data['show']['data']
+            page_id = compat_str(page_data['id'])
+            video_id = compat_str(page_data['video']['data']['id'])
+
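+            # qualities() returns a ranking callable: 'mini' sorts lowest, 'large' highest.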
+            preference = qualities(['mini', 'small', 'medium', 'large'])
+            thumbnails = [{
+                'url': image_url,
+                'id': image_id,
+                'preference': preference(image_id)
+            } for image_id, image_url in page_data.get('images', {}).items()]
+
+            info = {
+                'id': page_id,
+                'display_id': display_id,
+                'title': unescapeHTML(page_data['name']),
+                'description': unescapeHTML(page_data.get('summary')),
+                'timestamp': parse_iso8601(page_data.get('publishTime'), ' '),
+                'author': page_data.get('author'),
+                'uploader': show_data.get('name'),
+                'uploader_id': show_data.get('slug'),
+                'thumbnails': thumbnails,
+                'extractor_key': site,
+            }
+
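+            # Embed pages point at an external player; hand that URL over while
+            # keeping the metadata collected above via url_transparent.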
+            if page_type == 'embed':
+                info.update({
+                    '_type': 'url_transparent',
+                    'url': page_data['video']['data']['embed'],
+                })
+                return info
+
             video_data = self._download_json(
                 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
                 video_id)['items'][0]
@@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor):
                         })
             self._sort_formats(formats)
 
-            preference = qualities(['mini', 'small', 'medium', 'large'])
-            thumbnails = [{
-                'url': image_url,
-                'id': image_id,
-                'preference': preference(image_id)
-            } for image_id, image_url in video_data.get('images', {}).items()]
-
-            return {
-                'id': video_id,
-                'display_id': display_id,
+            info.update({
                 'title': unescapeHTML(video_data['title']),
                 'description': unescapeHTML(video_data.get('summary')),
-                'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '),
-                'author': episode_data.get('author'),
                 'uploader': video_data.get('show', {}).get('name'),
                 'uploader_id': video_data.get('show', {}).get('slug'),
                 'duration': int_or_none(video_data.get('duration')),
-                'thumbnails': thumbnails,
                 'formats': formats,
-            }
+            })
+            return info
         else:
-            show_data = page_info['show']['data']
+            list_data = page_info[page_type]['data']
             episodes_data = page_info['episodes']['data']
             num_episodes = page_info['meta']['totalEpisodes']
             processed_episodes = 0
             entries = []
             page_num = 1
             while True:
-                entries.extend([self.url_result(
-                    'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data])
+                entries.extend([{
+                    '_type': 'url',
+                    'url': 'http://%s%s' % (domain, episode['path']),
+                    'id': compat_str(episode['id']),
+                    'ie_key': 'Revision3',
+                    'extractor_key': site,
+                } for episode in episodes_data])
                 processed_episodes += len(episodes_data)
                 if processed_episodes == num_episodes:
                     break
@@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor):
                     display_id)['episodes']['data']
 
             return self.playlist_result(
-                entries, compat_str(show_data['id']),
-                show_data.get('name'), show_data.get('summary'))
+                entries, compat_str(list_data['id']),
+                list_data.get('name'), list_data.get('summary'))
index 603d7bd00620cef13cd957a54053dc7f77010e05..8a8c5d2a0e2b27bd6e476ef7d848753fbe648dd5 100644 (file)
@@ -10,6 +10,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     remove_end,
+    remove_start,
     sanitized_Request,
     std_headers,
     struct_unpack,
@@ -178,14 +179,14 @@ class RTVEInfantilIE(InfoExtractor):
 class RTVELiveIE(InfoExtractor):
     IE_NAME = 'rtve.es:live'
     IE_DESC = 'RTVE.es live streams'
-    _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+    _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
 
     _TESTS = [{
-        'url': 'http://www.rtve.es/noticias/directo-la-1/',
+        'url': 'http://www.rtve.es/directo/la-1/',
         'info_dict': {
-            'id': 'directo-la-1',
-            'ext': 'flv',
-            'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+            'id': 'la-1',
+            'ext': 'mp4',
+            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
         },
         'params': {
             'skip_download': 'live stream',
@@ -198,23 +199,20 @@ class RTVELiveIE(InfoExtractor):
         video_id = mobj.group('id')
 
         webpage = self._download_webpage(url, video_id)
-        player_url = self._search_regex(
-            r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL')
-        title = remove_end(self._og_search_title(webpage), ' en directo')
+        title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+        title = remove_start(title, 'Estoy viendo ')
         title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
 
         vidplayer_id = self._search_regex(
-            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
-        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+            r'playerId=player([0-9]+)', webpage, 'internal video ID')
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
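+        # The PNG fetched below embeds an obfuscated stream URL; _decrypt_url()
+        # recovers the m3u8 manifest from it.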
         png = self._download_webpage(png_url, video_id, 'Downloading url information')
-        video_url = _decrypt_url(png)
+        m3u8_url = _decrypt_url(png)
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
 
         return {
             'id': video_id,
-            'ext': 'flv',
             'title': title,
-            'url': video_url,
-            'app': 'rtve-live-live?ovpfv=2.1.2',
-            'player_url': player_url,
-            'rtmp_live': True,
+            'formats': formats,
+            'is_live': True,
         }
index 7de7b7273523ea8a43a6d22e8ab684afb4fc5875..256396bb8c21174fdaca4524c0b21d1ef7f802a9 100644 (file)
@@ -4,14 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
 
 from ..utils import (
     ExtractorError,
     sanitized_Request,
-    smuggle_url,
     std_headers,
     urlencode_postdata,
+    update_url_query,
 )
 
 
@@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor):
     _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
     _NETRC_MACHINE = 'safari'
 
-    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+    _API_BASE = 'https://www.safaribooksonline.com/api/v1'
     _API_FORMAT = 'json'
 
     LOGGED_IN = False
 
     def _real_initialize(self):
-        # We only need to log in once for courses or individual videos
-        if not self.LOGGED_IN:
-            self._login()
-            SafariBaseIE.LOGGED_IN = True
+        self._login()
 
     def _login(self):
+        # We only need to log in once for courses or individual videos
+        if self.LOGGED_IN:
+            return
+
         (username, password) = self._get_login_info()
         if username is None:
-            self.raise_login_required('safaribooksonline.com account is required')
+            return
 
-        headers = std_headers
+        headers = std_headers.copy()
         if 'Referer' not in headers:
             headers['Referer'] = self._LOGIN_URL
+        login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)
 
         login_page = self._download_webpage(
-            self._LOGIN_URL, None,
+            login_page_request, None,
             'Downloading login form')
 
         csrf = self._html_search_regex(
@@ -66,6 +67,8 @@ class SafariBaseIE(InfoExtractor):
                 'Login failed; make sure your credentials are correct and try again.',
                 expected=True)
 
+        SafariBaseIE.LOGGED_IN = True
+
         self.to_screen('Login successful')
 
 
@@ -85,13 +88,15 @@ class SafariIE(SafariBaseIE):
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
-        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+        'md5': 'dcc5a425e79f2564148652616af1f2a3',
         'info_dict': {
-            'id': '2842601850001',
+            'id': '0_qbqx90ic',
             'ext': 'mp4',
-            'title': 'Introduction',
+            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+            'timestamp': 1437758058,
+            'upload_date': '20150724',
+            'uploader_id': 'stork',
         },
-        'skip': 'Requires safaribooksonline account credentials',
     }, {
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
@@ -106,15 +111,30 @@ class SafariIE(SafariBaseIE):
         course_id = mobj.group('course_id')
         part = mobj.group('part')
 
-        webpage = self._download_webpage(
-            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
-            part)
+        webpage = self._download_webpage(url, '%s/%s' % (course_id, part))
+        reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id')
+        partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id')
+        ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id')
+
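+        # Build the Kaltura mwEmbedFrame query; an authenticated Kaltura
+        # session (ks) is appended below when the user is logged in.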
+        query = {
+            'wid': '_%s' % partner_id,
+            'uiconf_id': ui_id,
+            'flashvars[referenceId]': reference_id,
+        }
 
-        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
-        if not bc_url:
-            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+        if self.LOGGED_IN:
+            kaltura_session = self._download_json(
+                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+                course_id, 'Downloading kaltura session JSON',
+                'Unable to download kaltura session JSON', fatal=False)
+            if kaltura_session:
+                session = kaltura_session.get('session')
+                if session:
+                    query['flashvars[ks]'] = session
 
-        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+        return self.url_result(update_url_query(
+            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+            'Kaltura')
 
 
 class SafariCourseIE(SafariBaseIE):
@@ -140,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):
         course_id = self._match_id(url)
 
         course_json = self._download_json(
-            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
             course_id, 'Downloading course JSON')
 
         if 'chapters' not in course_json:
index 2cf210e0d609a219f6ac2cbd205311cc166afa29..44b0bbee68953a199c67e420fe1928048be5f2cf 100644 (file)
@@ -70,25 +70,27 @@ class ScreenwaveMediaIE(InfoExtractor):
 
         formats = []
         for source in sources:
-            if source['type'] == 'hls':
-                formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4'))
+            file_ = source.get('file')
+            if not file_:
+                continue
+            if source.get('type') == 'hls':
+                formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4'))
             else:
-                file_ = source.get('file')
-                if not file_:
-                    continue
-                format_label = source.get('label')
                 format_id = self._search_regex(
                     r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
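+                # Probe the URL and skip sources that are no longer reachable.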
+                if not self._is_valid_url(file_, video_id, format_id or 'video'):
+                    continue
+                format_label = source.get('label')
                 height = int_or_none(self._search_regex(
                     r'^(\d+)[pP]', format_label, 'height', default=None))
                 formats.append({
-                    'url': source['file'],
+                    'url': file_,
                     'format_id': format_id,
                     'format': format_label,
                     'ext': source.get('type'),
                     'height': height,
                 })
-        self._sort_formats(formats)
+        self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
 
         return {
             'id': video_id,
index 6365a8779d74e2ac9d82ce83c32c404d51e64b2e..a99b2a8e7be1bc9de8a01d6ae2de6fb36055703c 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -14,7 +12,7 @@ class SexuIE(InfoExtractor):
             'id': '961791',
             'ext': 'mp4',
             'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
-            'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
+            'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
             'categories': list,  # NSFW
             'thumbnail': 're:https?://.*\.jpg$',
             'age_limit': 18,
@@ -25,13 +23,18 @@ class SexuIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        quality_arr = self._search_regex(
-            r'sources:\s*\[([^\]]+)\]', webpage, 'format string')
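+        # The page configures its jwplayer via .setup({...}); parse that object as JSON.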
+        jwvideo = self._parse_json(
+            self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
+            video_id)
+
+        sources = jwvideo['sources']
+
         formats = [{
-            'url': fmt[0].replace('\\', ''),
-            'format_id': fmt[1],
-            'height': int(fmt[1][:3]),
-        } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)]
+            'url': source['file'].replace('\\', ''),
+            'format_id': source.get('label'),
+            'height': self._search_regex(
+                r'^(\d+)[pP]', source.get('label', ''), 'height', default=None),
+        } for source in sources if source.get('file')]
         self._sort_formats(formats)
 
         title = self._html_search_regex(
@@ -40,9 +43,7 @@ class SexuIE(InfoExtractor):
         description = self._html_search_meta(
             'description', webpage, 'description')
 
-        thumbnail = self._html_search_regex(
-            r'image:\s*"([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+        thumbnail = jwvideo.get('image')
 
         categories_str = self._html_search_meta(
             'keywords', webpage, 'categories')
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
deleted file mode 100644 (file)
index ebb5d6e..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
-from ..utils import RegexNotFoundError, ExtractorError
-
-
-class SpaceIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
-    _TEST = {
-        'add_ie': ['BrightcoveLegacy'],
-        'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
-        'info_dict': {
-            'id': '2780937028001',
-            'ext': 'mp4',
-            'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
-            'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
-            'uploader': 'TechMedia Networks',
-        },
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group('title')
-        webpage = self._download_webpage(url, title)
-        try:
-            # Some videos require the playerKey field, which isn't defined in
-            # the BrightcoveExperience object
-            brightcove_url = self._og_search_video_url(webpage)
-        except RegexNotFoundError:
-            # Other videos work fine with the info from the object
-            brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
-        if brightcove_url is None:
-            raise ExtractorError(
-                'The webpage does not contain a video', expected=True)
-        return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
index a48d77c309dcd1f9984cd0a6c71b7af574ca5498..cf8851438bb74000abb2692c34607f3137505f1d 100644 (file)
@@ -73,7 +73,7 @@ class TEDIE(InfoExtractor):
         'add_ie': ['Youtube'],
         'info_dict': {
             'id': '_ZG8HBuDjgc',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'Douglas Adams: Parrots the Universe and Everything',
             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
             'uploader': 'University of California Television (UCTV)',
index 6890021cf199e8b53a19cfa291bf7800477dab3b..9ee84468488c61979f35b527dad0b617133f04b2 100644 (file)
@@ -48,8 +48,6 @@ class TF1IE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
         wat_id = self._html_search_regex(
-            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
+            r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1',
             webpage, 'wat id', group='id')
-        wat_info = self._download_json(
-            'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
-        return self.url_result(wat_info['media']['url'], 'Wat')
+        return self.url_result('wat:%s' % wat_id, 'Wat')
index 755f816fff95bbe3567afe0168936eb6adb53095..9a57b49df66422a7a7828f2a12ff1385016951ef 100644 (file)
@@ -21,6 +21,8 @@ from ..utils import (
     sanitized_Request,
     unsmuggle_url,
     xpath_with_ns,
+    mimetype2ext,
+    find_xpath_attr,
 )
 
 default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -30,15 +32,11 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
 class ThePlatformBaseIE(InfoExtractor):
     def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
         meta = self._download_xml(smil_url, video_id, note=note)
-        try:
-            error_msg = next(
-                n.attrib['abstract']
-                for n in meta.findall(_x('.//smil:ref'))
-                if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
-        except StopIteration:
-            pass
-        else:
-            raise ExtractorError(error_msg, expected=True)
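+        # theplatform marks unavailable videos with a ref to a well-known error
+        # clip; its 'abstract' attribute holds the human-readable reason.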
+        error_element = find_xpath_attr(
+            meta, _x('.//smil:ref'), 'src',
+            'http://link.theplatform.com/s/errorFiles/Unavailable.mp4')
+        if error_element is not None:
+            raise ExtractorError(error_element.attrib['abstract'], expected=True)
 
         formats = self._parse_smil_formats(
             meta, smil_url, video_id, namespace=default_ns,
@@ -68,7 +66,7 @@ class ThePlatformBaseIE(InfoExtractor):
             for caption in captions:
                 lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
                 subtitles[lang] = [{
-                    'ext': 'srt' if mime == 'text/srt' else 'ttml',
+                    'ext': mimetype2ext(mime),
                     'url': src,
                 }]
 
index 49516abca690721a83dee5044bb2cdd6540d4a07..79f036fe4eca77f57ddd9e1fd912317f9af00ba5 100644 (file)
@@ -71,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id
 
         webpage = self._download_webpage(url, display_id)
 
@@ -117,7 +117,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
         title = self._html_search_regex(
             self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
 
-        age_limit = self._rta_search(webpage)
+        age_limit = self._rta_search(webpage) or 18
 
         duration = parse_duration(self._html_search_meta(
             'duration', webpage, 'duration', default=None))
@@ -152,6 +152,36 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
         }
 
 
+class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+
+    _TITLE_REGEX = r'<title>([^<]+)</title>'
+
+    _TESTS = [{
+        'url': 'https://player.tnaflix.com/video/6538',
+        'info_dict': {
+            'id': '6538',
+            'display_id': '6538',
+            'ext': 'mp4',
+            'title': 'Educational xxx video',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://player.empflix.com/video/33051',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
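+        # Lets the generic extractor pick up tnaflix/empflix player iframes
+        # embedded in third-party pages.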
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
+            webpage)]
+
+
 class TNAFlixIE(TNAFlixNetworkBaseIE):
     _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
index 69882da6337bc6f20d0578f6d612e799429db3e8..958bf8fff58b4264742871ba019d9f8c5be21adb 100644 (file)
@@ -17,6 +17,7 @@ from ..utils import (
     encode_dict,
     ExtractorError,
     int_or_none,
+    orderedSet,
     parse_duration,
     parse_iso8601,
     sanitized_Request,
@@ -251,6 +252,7 @@ class TwitchVodIE(TwitchItemBaseIE):
                 self._USHER_BASE, item_id,
                 compat_urllib_parse.urlencode({
                     'allow_source': 'true',
+                    'allow_audio_only': 'true',
                     'allow_spectre': 'true',
                     'player': 'twitchweb',
                     'nauth': access_token['token'],
@@ -281,17 +283,36 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
         entries = []
         offset = 0
         limit = self._PAGE_LIMIT
+        broken_paging_detected = False
+        counter_override = None
         for counter in itertools.count(1):
             response = self._download_json(
                 self._PLAYLIST_URL % (channel_id, offset, limit),
-                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+                channel_id,
+                'Downloading %s videos JSON page %s'
+                % (self._PLAYLIST_TYPE, counter_override or counter))
             page_entries = self._extract_playlist_page(response)
             if not page_entries:
                 break
+            total = int_or_none(response.get('_total'))
+            # Since the beginning of March 2016 Twitch's paging mechanism
+            # has been completely broken on the Twitch side: it simply
+            # ignores the limit and returns the whole offset number of
+            # videos. Work around this by requesting all videos at once.
+            if not broken_paging_detected and total and len(page_entries) > limit:
+                self.report_warning(
+                    'Twitch paging is broken on the Twitch side, requesting all videos at once',
+                    channel_id)
+                broken_paging_detected = True
+                offset = total
+                counter_override = '(all at once)'
+                continue
             entries.extend(page_entries)
+            if broken_paging_detected or (total and len(page_entries) >= total):
+                break
             offset += limit
         return self.playlist_result(
-            [self.url_result(entry) for entry in set(entries)],
+            [self.url_result(entry) for entry in orderedSet(entries)],
             channel_id, channel_name)
 
     def _extract_playlist_page(self, response):
@@ -411,6 +432,7 @@ class TwitchStreamIE(TwitchBaseIE):
 
         query = {
             'allow_source': 'true',
+            'allow_audio_only': 'true',
             'p': random.randint(1000000, 10000000),
             'player': 'twitchweb',
             'segment_preference': '4',
index a161f046b2532805d864a26e083de06f68cf7a1f..e70b2ab3c8d564cd907e8763fd8136e1b3827ac5 100644 (file)
@@ -10,21 +10,26 @@ from ..utils import (
     remove_end,
     int_or_none,
     ExtractorError,
-    sanitized_Request,
 )
 
 
-class TwitterCardIE(InfoExtractor):
+class TwitterBaseIE(InfoExtractor):
+    def _get_vmap_video_url(self, vmap_url, video_id):
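+        # A VMAP document wraps the actual media URL in a MediaFile element.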
+        vmap_data = self._download_xml(vmap_url, video_id)
+        return xpath_text(vmap_data, './/MediaFile').strip()
+
+
+class TwitterCardIE(TwitterBaseIE):
     IE_NAME = 'twitter:card'
-    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'
     _TESTS = [
         {
             'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
-            'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
+            # MD5 checksums are different in different places
             'info_dict': {
                 'id': '560070183650213889',
                 'ext': 'mp4',
-                'title': 'TwitterCard',
+                'title': 'Twitter Card',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 30.033,
             }
@@ -35,14 +40,14 @@ class TwitterCardIE(InfoExtractor):
             'info_dict': {
                 'id': '623160978427936768',
                 'ext': 'mp4',
-                'title': 'TwitterCard',
+                'title': 'Twitter Card',
                 'thumbnail': 're:^https?://.*\.jpg',
                 'duration': 80.155,
             },
         },
         {
             'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
-            'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+            'md5': 'd4724ffe6d2437886d004fa5de1043b3',
             'info_dict': {
                 'id': 'dq4Oj5quskI',
                 'ext': 'mp4',
@@ -62,69 +67,106 @@ class TwitterCardIE(InfoExtractor):
                 'ext': 'mp4',
                 'upload_date': '20151113',
                 'uploader_id': '1189339351084113920',
-                'uploader': '@ArsenalTerje',
-                'title': 'Vine by @ArsenalTerje',
+                'uploader': 'ArsenalTerje',
+                'title': 'Vine by ArsenalTerje',
             },
             'add_ie': ['Vine'],
-        }
+        }, {
+            'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
+            'md5': '3846d0a07109b5ab622425449b59049d',
+            'info_dict': {
+                'id': '705235433198714880',
+                'ext': 'mp4',
+                'title': 'Twitter web player',
+                'thumbnail': 're:^https?://.*\.jpg',
+            },
+        },
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # Different formats served for different User-Agents
-        USER_AGENTS = [
-            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4
-            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm
-        ]
-
         config = None
         formats = []
-        for user_agent in USER_AGENTS:
-            request = sanitized_Request(url)
-            request.add_header('User-Agent', user_agent)
-            webpage = self._download_webpage(request, video_id)
-
-            iframe_url = self._html_search_regex(
-                r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
-                webpage, 'video iframe', default=None)
-            if iframe_url:
-                return self.url_result(iframe_url)
-
-            config = self._parse_json(self._html_search_regex(
-                r'data-player-config="([^"]+)"', webpage, 'data player config'),
-                video_id)
-            if 'playlist' not in config:
-                if 'vmapUrl' in config:
-                    vmap_data = self._download_xml(config['vmapUrl'], video_id)
-                    video_url = xpath_text(vmap_data, './/MediaFile').strip()
-                    formats.append({
-                        'url': video_url,
-                    })
-                    break   # same video regardless of UA
-                continue
-
-            video_url = config['playlist'][0]['source']
+        duration = None
 
-            f = {
-                'url': video_url,
-            }
+        webpage = self._download_webpage(url, video_id)
+
+        iframe_url = self._html_search_regex(
+            r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
+            webpage, 'video iframe', default=None)
+        if iframe_url:
+            return self.url_result(iframe_url)
+
+        config = self._parse_json(self._html_search_regex(
+            r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),
+            video_id)
 
+        def _search_dimensions_in_video_url(a_format, video_url):
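+            # Twitter video URLs often encode dimensions as /WIDTHxHEIGHT/ in the path.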
             m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
             if m:
-                f.update({
+                a_format.update({
                     'width': int(m.group('width')),
                     'height': int(m.group('height')),
                 })
+
+        playlist = config.get('playlist')
+        if playlist:
+            video_url = playlist[0]['source']
+
+            f = {
+                'url': video_url,
+            }
+
+            _search_dimensions_in_video_url(f, video_url)
+
             formats.append(f)
+
+        vmap_url = config.get('vmapUrl') or config.get('vmap_url')
+        if vmap_url:
+            formats.append({
+                'url': self._get_vmap_video_url(vmap_url, video_id),
+            })
+
+        media_info = None
+
+        for entity in config.get('status', {}).get('entities', []):
+            if 'mediaInfo' in entity:
+                media_info = entity['mediaInfo']
+
+        if media_info:
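+            # Dispatch each variant on its URL extension: .m3u8 -> HLS,
+            # .mpd -> DASH, anything else -> a progressive HTTP download.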
+            for media_variant in media_info['variants']:
+                media_url = media_variant['url']
+                if media_url.endswith('.m3u8'):
+                    formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+                elif media_url.endswith('.mpd'):
+                    formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+                else:
+                    vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
+                    a_format = {
+                        'url': media_url,
+                        'format_id': 'http-%d' % vbr if vbr else 'http',
+                        'vbr': vbr,
+                    }
+                    # Reported bitRate may be zero
+                    if not a_format['vbr']:
+                        del a_format['vbr']
+
+                    _search_dimensions_in_video_url(a_format, media_url)
+
+                    formats.append(a_format)
+
+            duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
+
         self._sort_formats(formats)
 
-        thumbnail = config.get('posterImageUrl')
-        duration = float_or_none(config.get('duration'))
+        title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+        thumbnail = config.get('posterImageUrl') or config.get('image_src')
+        duration = float_or_none(config.get('duration')) or duration
 
         return {
             'id': video_id,
-            'title': 'TwitterCard',
+            'title': title,
             'thumbnail': thumbnail,
             'duration': duration,
             'formats': formats,
@@ -138,7 +180,6 @@ class TwitterIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'https://twitter.com/freethenipple/status/643211948184596480',
-        'md5': 'db6612ec5d03355953c3ca9250c97e5e',
         'info_dict': {
             'id': '643211948184596480',
             'ext': 'mp4',
@@ -149,6 +190,9 @@ class TwitterIE(InfoExtractor):
             'uploader': 'FREE THE NIPPLE',
             'uploader_id': 'freethenipple',
         },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
     }, {
         'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
         'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@@ -161,6 +205,7 @@ class TwitterIE(InfoExtractor):
             'uploader': 'Gifs',
             'uploader_id': 'giphz',
         },
+        'expected_warnings': ['height', 'width'],
     }, {
         'url': 'https://twitter.com/starwars/status/665052190608723968',
         'md5': '39b7199856dee6cd4432e72c74bc69d4',
@@ -172,6 +217,36 @@ class TwitterIE(InfoExtractor):
             'uploader_id': 'starwars',
             'uploader': 'Star Wars',
         },
+    }, {
+        'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
+        'info_dict': {
+            'id': '705235433198714880',
+            'ext': 'mp4',
+            'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
+            'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
+            'uploader_id': 'BTNBrentYarina',
+            'uploader': 'Brent Yarina',
+        },
+        'params': {
+            # The same video as https://twitter.com/i/videos/tweet/705235433198714880
+            # Test case of TwitterCardIE
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
+        'md5': '',
+        'info_dict': {
+            'id': '700207533655363584',
+            'ext': 'mp4',
+            'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel',
+            'description': 'jay on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'uploader': 'jay',
+            'uploader_id': 'jaydingeer',
+        },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
     }]
 
     def _real_extract(self, url):
@@ -208,21 +283,91 @@ class TwitterIE(InfoExtractor):
             return info
 
         mobj = re.search(r'''(?x)
-            <video[^>]+class="animated-gif"[^>]+
-                (?:data-height="(?P<height>\d+)")?[^>]+
-                (?:data-width="(?P<width>\d+)")?[^>]+
-                (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
+            <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
                 <source[^>]+video-src="(?P<url>[^"]+)"
         ''', webpage)
 
         if mobj:
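+            # data-height/data-width/poster may appear in any order, so pull
+            # them out of the captured attribute blob individually.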
+            more_info = mobj.group('more_info')
+            height = int_or_none(self._search_regex(
+                r'data-height="(\d+)"', more_info, 'height', fatal=False))
+            width = int_or_none(self._search_regex(
+                r'data-width="(\d+)"', more_info, 'width', fatal=False))
+            thumbnail = self._search_regex(
+                r'poster="([^"]+)"', more_info, 'poster', fatal=False)
             info.update({
                 'id': twid,
                 'url': mobj.group('url'),
-                'height': int_or_none(mobj.group('height')),
-                'width': int_or_none(mobj.group('width')),
-                'thumbnail': mobj.group('poster'),
+                'height': height,
+                'width': width,
+                'thumbnail': thumbnail,
             })
             return info
 
-        raise ExtractorError('There\'s not video in this tweet.')
+        if 'class="PlayableMedia' in webpage:
+            info.update({
+                '_type': 'url_transparent',
+                'ie_key': 'TwitterCard',
+                'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid),
+            })
+
+            return info
+
+        raise ExtractorError('There\'s no video in this tweet.')
+
+
+class TwitterAmplifyIE(TwitterBaseIE):
+    IE_NAME = 'twitter:amplify'
+    _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'
+
+    _TEST = {
+        'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+        'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
+        'info_dict': {
+            'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+            'ext': 'mp4',
+            'title': 'Twitter Video',
+            'thumbnail': 're:^https?://.*',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        vmap_url = self._html_search_meta(
+            'twitter:amplify:vmap', webpage, 'vmap url')
+        video_url = self._get_vmap_video_url(vmap_url, video_id)
+
+        thumbnails = []
+        thumbnail = self._html_search_meta(
+            'twitter:image:src', webpage, 'thumbnail', fatal=False)
+
+        def _find_dimension(target):
+            w = int_or_none(self._html_search_meta(
+                'twitter:%s:width' % target, webpage, fatal=False))
+            h = int_or_none(self._html_search_meta(
+                'twitter:%s:height' % target, webpage, fatal=False))
+            return w, h
+
+        if thumbnail:
+            thumbnail_w, thumbnail_h = _find_dimension('image')
+            thumbnails.append({
+                'url': thumbnail,
+                'width': thumbnail_w,
+                'height': thumbnail_h,
+            })
+
+        video_w, video_h = _find_dimension('player')
+        formats = [{
+            'url': video_url,
+            'width': video_w,
+            'height': video_h,
+        }]
+
+        return {
+            'id': video_id,
+            'title': 'Twitter Video',
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py
new file mode 100644 (file)
index 0000000..cafc082
--- /dev/null
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
+
+
+class UstudioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
+        'md5': '58bbfca62125378742df01fc2abbdef6',
+        'info_dict': {
+            'id': 'Uxu2my9bgSph',
+            'display_id': 'san_francisco_golden_gate_bridge',
+            'ext': 'mp4',
+            'title': 'San Francisco: Golden Gate Bridge',
+            'description': 'md5:23925500697f2c6d4830e387ba51a9be',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20111107',
+            'uploader': 'Tony Farley',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        config = self._download_xml(
+            'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
+            display_id)
+
+        def extract(kind):
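+            # Collect url/width/height entries for either 'video' or 'image'
+            # quality nodes from the config XML.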
+            return [{
+                'url': item.attrib['url'],
+                'width': int_or_none(item.get('width')),
+                'height': int_or_none(item.get('height')),
+            } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
+
+        formats = extract('video')
+        self._sort_formats(formats)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
+            webpage, 'upload date', fatal=False))
+        uploader = self._search_regex(
+            r'Uploaded by\s*<a[^>]*>([^<]+)<',
+            webpage, 'uploader', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnails': extract('image'),
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'formats': formats,
+        }
index 14e945d494cd2f6e5f3b3e6a03ff6ebb076826dd..e148b1ef513321376efe1795056503ea2a8bcad8 100644 (file)
@@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):
         'aftenbladet.no/tv': 'satv',
         'fvn.no/fvntv': 'fvntv',
         'aftenposten.no/webtv': 'aptv',
+        'ap.vgtv.no/webtv': 'aptv',
     }
 
     _APP_NAME_TO_VENDOR = {
@@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):
                     (?P<host>
                         %s
                     )
-                    /
+                    /?
                     (?:
                         \#!/(?:video|live)/|
                         embed?.*id=
@@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):
             'md5': 'fd828cd29774a729bf4d4425fe192972',
             'info_dict': {
                 'id': '21039',
-                'ext': 'mov',
+                'ext': 'mp4',
                 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',
                 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
                 'duration': 66,
                 'timestamp': 1417002452,
                 'upload_date': '20141126',
                 'view_count': int,
-            }
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
         },
         {
             'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
             'only_matching': True,
         },
+        {
+            'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
@@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):
         if len(video_id) == 5:
             if appname == 'bttv':
                 info = self._extract_video_info('btno', video_id)
-            elif appname == 'aptv':
-                info = self._extract_video_info('ap', video_id)
 
         streams = data['streamUrls']
         stream_type = data.get('streamType')
index 3db6286e48c1402ff210d8f4cc666dac1a918a86..46c785ae183d72207ab12500618f3eb7b765373d 100644 (file)
@@ -1,31 +1,37 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from .ooyala import OoyalaIE
 from ..utils import ExtractorError
 
 
 class ViceIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
-
-    _TESTS = [
-        {
-            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
-            'info_dict': {
-                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
-                'ext': 'mp4',
-                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-                'duration': 725.983,
-            },
-            'params': {
-                # Requires ffmpeg (m3u8 manifest)
-                'skip_download': True,
-            },
-        }, {
-            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
-            'only_matching': True,
-        }
-    ]
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+        'info_dict': {
+            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+            'ext': 'mp4',
+            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+            'duration': 725.983,
+        },
+        'params': {
+            # Requires ffmpeg (m3u8 manifest)
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
+        'only_matching': True,
+    }, {
+        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):
         except ExtractorError:
             raise ExtractorError('The page doesn\'t contain a video', expected=True)
         return self.url_result(ooyala_url, ie='Ooyala')
+
+
+class ViceShowIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
+        'info_dict': {
+            'id': 'fuck-thats-delicious-2',
+            'title': "Fuck, That's Delicious",
+            'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
+        },
+        'playlist_count': 17,
+    }
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+
+        entries = [
+            self.url_result(video_url, ViceIE.ie_key())
+            for video_url, _ in re.findall(
+                r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
+                % ViceIE._VALID_URL, webpage)]
+
+        title = self._search_regex(
+            r'<title>(.+?)</title>', webpage, 'title', default=None)
+        if title:
+            title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
+        description = self._html_search_meta('description', webpage, 'description')
+
+        return self.playlist_result(entries, show_id, title, description)
index 5e2e7cbacc4d52a91dd3a789b76874fffac069db..4f0dcd18c7f28ab17aec58c814d53fd8ae21e7ac 100644 (file)
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import sanitized_Request
+from ..utils import (
+    decode_packed_codes,
+    sanitized_Request,
+)
 
 
 class VideoMegaIE(InfoExtractor):
-    _WORKING = False
     _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
     _TESTS = [{
         'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
@@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor):
             r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
         thumbnail = self._search_regex(
             r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
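+        # The real source URL is hidden inside packed JavaScript; unpack it first.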
+        real_codes = decode_packed_codes(webpage)
         video_url = self._search_regex(
-            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
+            r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL')
 
         return {
             'id': video_id,
index 7c6e98026823e0df415664ad0b4d6995c9e2f673..3c78fb3d5a071a6f49dec7467e620c0b8a01ded9 100644 (file)
@@ -1,11 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import smuggle_url
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+    decode_packed_codes,
+    js_to_json,
+)
 
 
-class VidziIE(InfoExtractor):
+class VidziIE(JWPlatformBaseIE):
     _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
     _TEST = {
         'url': 'http://vidzi.tv/cghql9yq6emu.html',
@@ -14,7 +17,6 @@ class VidziIE(InfoExtractor):
             'id': 'cghql9yq6emu',
             'ext': 'mp4',
             'title': 'youtube-dl test video  1\\\\2\'3/4<5\\\\6ä7↭',
-            'uploader': 'vidzi.tv',
         },
         'params': {
             # m3u8 download
@@ -29,11 +31,12 @@ class VidziIE(InfoExtractor):
         title = self._html_search_regex(
             r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
 
-        # Vidzi now uses jwplayer, which can be handled by GenericIE
-        return {
-            '_type': 'url_transparent',
-            'id': video_id,
-            'title': title,
-            'url': smuggle_url(url, {'to_generic': True}),
-            'ie_key': 'Generic',
-        }
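+        # The jwplayer setup() call is hidden inside packed JavaScript; unpack
+        # it and unescape quotes before parsing the setup object as JSON.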
+        code = decode_packed_codes(webpage).replace('\\\'', '\'')
+        jwplayer_data = self._parse_json(
+            self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'),
+            video_id, transform_source=js_to_json)
+
+        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+        info_dict['title'] = title
+
+        return info_dict
index 433fc9914a1d59fbc6743e8e82815b429a695cc5..e04b814c8cf27755bfe0a86af3d5bf43262bd0da 100644 (file)
@@ -176,13 +176,13 @@ class VikiIE(VikiBaseIE):
     }, {
         # youtube external
         'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
-        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+        'md5': '63f8600c1da6f01b7640eee7eca4f1da',
         'info_dict': {
             'id': '50562v',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'Poor Nastya [COMPLETE] - Episode 1',
             'description': '',
-            'duration': 607,
+            'duration': 606,
             'timestamp': 1274949505,
             'upload_date': '20101213',
             'uploader': 'ad14065n',
index 3049dffb6c98ed95251262bd3013add99f4b41db..71c30d2cde54f11802f1e187160ae48c0ea88423 100644 (file)
@@ -73,15 +73,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
 
     # _VALID_URL matches Vimeo URLs
     _VALID_URL = r'''(?x)
-        https?://
-        (?:(?:www|(?P<player>player))\.)?
-        vimeo(?P<pro>pro)?\.com/
-        (?!channels/[^/?#]+/?(?:$|[?#])|album/)
-        (?:.*?/)?
-        (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
-        (?:videos?/)?
-        (?P<id>[0-9]+)
-        /?(?:[?&].*)?(?:[#].*)?$'''
+                    https?://
+                        (?:
+                            (?:
+                                www|
+                                (?P<player>player)
+                            )
+                            \.
+                        )?
+                        vimeo(?P<pro>pro)?\.com/
+                        (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/)
+                        (?:.*?/)?
+                        (?:
+                            (?:
+                                play_redirect_hls|
+                                moogaloop\.swf
+                            )\?clip_id=
+                        )?
+                        (?:videos?/)?
+                        (?P<id>[0-9]+)
+                        /?(?:[?&].*)?(?:[#].*)?$
+                    '''
     IE_NAME = 'vimeo'
     _TESTS = [
         {
@@ -93,6 +104,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
                 'description': 'md5:2d3305bad981a06ff79f027f19865021',
                 'upload_date': '20121220',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434',
                 'uploader_id': 'user7108434',
                 'uploader': 'Filippo Valsorda',
                 'duration': 10,
@@ -105,6 +117,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
             'info_dict': {
                 'id': '68093876',
                 'ext': 'mp4',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
                 'uploader_id': 'openstreetmapus',
                 'uploader': 'OpenStreetMap US',
                 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
@@ -121,6 +134,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'ext': 'mp4',
                 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
                 'uploader': 'The BLN & Business of Software',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
                 'uploader_id': 'theblnbusinessofsoftware',
                 'duration': 3610,
                 'description': None,
@@ -135,6 +149,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'ext': 'mp4',
                 'title': 'youtube-dl password protected test video',
                 'upload_date': '20130614',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128',
                 'uploader_id': 'user18948128',
                 'uploader': 'Jaime Marquínez Ferrándiz',
                 'duration': 10,
@@ -154,6 +169,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'ext': 'mp4',
                 'title': 'Key & Peele: Terrorist Interrogation',
                 'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio',
                 'uploader_id': 'atencio',
                 'uploader': 'Peter Atencio',
                 'upload_date': '20130927',
@@ -169,6 +185,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'title': 'The New Vimeo Player (You Know, For Videos)',
                 'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
                 'upload_date': '20131015',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff',
                 'uploader_id': 'staff',
                 'uploader': 'Vimeo Staff',
                 'duration': 62,
@@ -183,6 +200,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'ext': 'mp4',
                 'title': 'Pier Solar OUYA Official Trailer',
                 'uploader': 'Tulio Gonçalves',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593',
                 'uploader_id': 'user28849593',
             },
         },
@@ -195,6 +213,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 'ext': 'mp4',
                 'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
                 'uploader': 'The DMCI',
+                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci',
                 'uploader_id': 'dmci',
                 'upload_date': '20111220',
                 'description': 'md5:ae23671e82d05415868f7ad1aec21147',
@@ -269,9 +288,8 @@ class VimeoIE(VimeoBaseInfoExtractor):
 
     def _real_extract(self, url):
         url, data = unsmuggle_url(url, {})
-        headers = std_headers
+        headers = std_headers.copy()
         if 'http_headers' in data:
-            headers = headers.copy()
             headers.update(data['http_headers'])
         if 'Referer' not in headers:
             headers['Referer'] = url
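
Note on the hunk above: std_headers is a module-level dict, so the old unconditional `headers = std_headers` aliased the global, and the later `headers['Referer'] = url` mutated the shared default whenever no custom http_headers were smuggled in. A minimal sketch of the hazard the copy() fixes (illustrative names only):

    std_headers = {'User-Agent': 'youtube-dl'}      # shared module-level default

    def old_way(url, data=None):
        headers = std_headers                       # aliases the global dict
        if data and 'http_headers' in data:
            headers = headers.copy()                # copied only on this branch
        if 'Referer' not in headers:
            headers['Referer'] = url                # otherwise mutates the global

    old_way('https://vimeo.com/1')
    assert 'Referer' in std_headers                 # leaked into later requests
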
@@ -286,7 +304,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
             url = 'https://vimeo.com/' + video_id
 
         # Retrieve video webpage to extract further information
-        request = sanitized_Request(url, None, headers)
+        request = sanitized_Request(url, headers=headers)
         try:
             webpage = self._download_webpage(request, video_id)
         except ExtractorError as ee:
@@ -370,9 +388,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
         # Extract title
         video_title = config['video']['title']
 
-        # Extract uploader and uploader_id
-        video_uploader = config['video']['owner']['name']
-        video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None
+        # Extract uploader, uploader_url and uploader_id
+        video_uploader = config['video'].get('owner', {}).get('name')
+        video_uploader_url = config['video'].get('owner', {}).get('url')
+        video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
 
         # Extract video thumbnail
         video_thumbnail = config['video'].get('thumbnail')
@@ -473,6 +492,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
         return {
             'id': video_id,
             'uploader': video_uploader,
+            'uploader_url': video_uploader_url,
             'uploader_id': video_uploader_id,
             'upload_date': video_upload_date,
             'title': video_title,
@@ -488,6 +508,38 @@ class VimeoIE(VimeoBaseInfoExtractor):
         }
 
 
+class VimeoOndemandIE(VimeoBaseInfoExtractor):
+    IE_NAME = 'vimeo:ondemand'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # ondemand video not available via https://vimeo.com/id
+        'url': 'https://vimeo.com/ondemand/20704',
+        'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
+        'info_dict': {
+            'id': '105442900',
+            'ext': 'mp4',
+            'title': 'המעבדה - במאי יותם פלדמן',
+            'uploader': 'גם סרטים',
+            'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms',
+            'uploader_id': 'gumfilms',
+        },
+    }, {
+        'url': 'https://vimeo.com/ondemand/nazmaalik',
+        'only_matching': True,
+    }, {
+        'url': 'https://vimeo.com/ondemand/141692381',
+        'only_matching': True,
+    }, {
+        'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key())
+
+
 class VimeoChannelIE(VimeoBaseInfoExtractor):
     IE_NAME = 'vimeo:channel'
     _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
index 0805e3c083937d7f58acbe4b872d41c19f8d1c02..d560a4b5e219c2d62cff17da8e47c3cfbb5f87ba 100644 (file)
@@ -11,6 +11,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    int_or_none,
     orderedSet,
     sanitized_Request,
     str_to_int,
@@ -141,16 +142,29 @@ class VKIE(InfoExtractor):
             'url': 'https://vk.com/video276849682_170681728',
             'info_dict': {
                 'id': 'V3K4mi0SYkc',
-                'ext': 'mp4',
+                'ext': 'webm',
                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
                 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
-                'duration': 179,
+                'duration': 178,
                 'upload_date': '20130116',
                 'uploader': "Children's Joy Foundation",
                 'uploader_id': 'thecjf',
                 'view_count': int,
             },
         },
+        {
+            # video key is extra_data not url\d+
+            'url': 'http://vk.com/video-110305615_171782105',
+            'md5': 'e13fcda136f99764872e739d13fac1d1',
+            'info_dict': {
+                'id': '171782105',
+                'ext': 'mp4',
+                'title': 'S-Dance, репетиции к The way show',
+                'uploader': 'THE WAY SHOW | 17 апреля',
+                'upload_date': '20160207',
+                'view_count': int,
+            },
+        },
         {
             # removed video, just testing that we match the pattern
             'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
@@ -298,12 +312,17 @@ class VKIE(InfoExtractor):
             view_count = str_to_int(self._search_regex(
                 r'([\d,.]+)', views, 'view count', fatal=False))
 
-        formats = [{
-            'format_id': k,
-            'url': v,
-            'width': int(k[len('url'):]),
-        } for k, v in data.items()
-            if k.startswith('url')]
+        formats = []
+        for k, v in data.items():
+            if not k.startswith('url') and k != 'extra_data' or not v:
+                continue
+            height = int_or_none(self._search_regex(
+                r'^url(\d+)', k, 'height', default=None))
+            formats.append({
+                'format_id': k,
+                'url': v,
+                'height': height,
+            })
         self._sort_formats(formats)
 
         return {
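
A quick sketch of what the rewritten loop yields for a typical data dict (values synthetic): url\d+ keys become height-tagged formats, extra_data is kept as a format without a height, and empty values or unrelated keys are skipped.

    import re

    data = {
        'url240': 'https://host/240.mp4',     # synthetic
        'url720': 'https://host/720.mp4',
        'extra_data': 'https://host/alt.mp4',
        'url480': '',                         # falsy value: skipped
        'hash2': 'abc',                       # unrelated key: skipped
    }
    formats = []
    for k, v in data.items():
        if not k.startswith('url') and k != 'extra_data' or not v:
            continue
        m = re.match(r'url(\d+)', k)
        formats.append({'format_id': k, 'url': v,
                        'height': int(m.group(1)) if m else None})
    # -> formats for url240 (240), url720 (720) and extra_data (height None)
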
index affcc52f6e244c40bbca6381700c2f15e645580f..37cf3d3097c94b39f1b66ab11e0e651579f8d533 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 
 class WatIE(InfoExtractor):
-    _VALID_URL = r'http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html'
+    _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
     IE_NAME = 'wat.tv'
     _TESTS = [
         {
@@ -54,10 +54,12 @@ class WatIE(InfoExtractor):
         def real_id_for_chapter(chapter):
             return chapter['tc_start'].split('-')[0]
         mobj = re.match(self._VALID_URL, url)
-        short_id = mobj.group('short_id')
         display_id = mobj.group('display_id')
-        webpage = self._download_webpage(url, display_id or short_id)
-        real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+        real_id = mobj.group('real_id')
+        if not real_id:
+            short_id = mobj.group('short_id')
+            webpage = self._download_webpage(url, display_id or short_id)
+            real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
 
         video_info = self.download_video_info(real_id)
 
index 2037d9b3d57cd5876d85e9552ffcc9f387fcc975..7aea47ed52f7f64032034ab43d51dbe524bff2b3 100644 (file)
@@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor):
     _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
     _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
     _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
-    _TESTS = [
-        {
-            'url': 'http://www.webofstories.com/play/hans.bethe/71',
-            'md5': '373e4dd915f60cfe3116322642ddf364',
-            'info_dict': {
-                'id': '4536',
-                'ext': 'mp4',
-                'title': 'The temperature of the sun',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Hans Bethe talks about calculating the temperature of the sun',
-                'duration': 238,
-            }
+    _TESTS = [{
+        'url': 'http://www.webofstories.com/play/hans.bethe/71',
+        'md5': '373e4dd915f60cfe3116322642ddf364',
+        'info_dict': {
+            'id': '4536',
+            'ext': 'mp4',
+            'title': 'The temperature of the sun',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'Hans Bethe talks about calculating the temperature of the sun',
+            'duration': 238,
+        }
+    }, {
+        'url': 'http://www.webofstories.com/play/55908',
+        'md5': '2985a698e1fe3211022422c4b5ed962c',
+        'info_dict': {
+            'id': '55908',
+            'ext': 'mp4',
+            'title': 'The story of Gemmata obscuriglobus',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+            'duration': 169,
+        },
+        'skip': 'notfound',
+    }, {
+        # malformed og:title meta
+        'url': 'http://www.webofstories.com/play/54215?o=MS',
+        'info_dict': {
+            'id': '54215',
+            'ext': 'mp4',
+            'title': '"A Leg to Stand On"',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+            'duration': 97,
         },
-        {
-            'url': 'http://www.webofstories.com/play/55908',
-            'md5': '2985a698e1fe3211022422c4b5ed962c',
-            'info_dict': {
-                'id': '55908',
-                'ext': 'mp4',
-                'title': 'The story of Gemmata obscuriglobus',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
-                'duration': 169,
-            }
+        'params': {
+            'skip_download': True,
         },
-    ]
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
-        title = self._og_search_title(webpage)
+        # Sometimes og:title meta is malformed
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
         description = self._html_search_meta('description', webpage)
         thumbnail = self._og_search_thumbnail(webpage)
 
index 041ff6c555123d44c97bc63810d2aa7903ec069e..fb0accac744532625c04bb964c1fa031723ed8ff 100644 (file)
@@ -20,7 +20,7 @@ class WimpIE(InfoExtractor):
         'md5': '4e2986c793694b55b37cf92521d12bb4',
         'info_dict': {
             'id': 'clowncar',
-            'ext': 'mp4',
+            'ext': 'webm',
             'title': 'It\'s like a clown car.',
             'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
         },
index fdb16d91c25ae4a3d3c8314a76050d09cf86ef3a..41061dd3125a6f298a5aa915c62b50aabdb68a47 100644 (file)
@@ -35,7 +35,8 @@ class WistiaIE(InfoExtractor):
 
         formats = []
         thumbnails = []
-        for atype, a in data['assets'].items():
+        for a in data['assets']:
+            atype = a.get('type')
             if atype == 'still':
                 thumbnails.append({
                     'url': a['url'],
index a3236e66cdba09235ff94960bc8a47984e6c3eb7..94abdb4f3ed3b99d5ab4f4b60897159d0b16bfc6 100644 (file)
@@ -17,7 +17,7 @@ class XFileShareIE(InfoExtractor):
     IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
     _VALID_URL = r'''(?x)
         https?://(?P<host>(?:www\.)?
-            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
+            (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/
         (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
     '''
 
@@ -81,6 +81,13 @@ class XFileShareIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'test'
         }
+    }, {
+        'url': 'http://powerwatch.pw/duecjibvicbu',
+        'info_dict': {
+            'id': 'duecjibvicbu',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny trailer',
+        },
     }]
 
     def _real_extract(self, url):
@@ -112,6 +119,7 @@ class XFileShareIE(InfoExtractor):
         title = (self._search_regex(
             [r'style="z-index: [0-9]+;">([^<]+)</span>',
              r'<td nowrap>([^<]+)</td>',
+             r'h4-fine[^>]*>([^<]+)<',
              r'>Watch (.+) ',
              r'<h2 class="video-page-head">([^<]+)</h2>'],
             webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
index d3cc1a29fa473fee2f58e91323774633be00fc4b..e699e663f60818b090bb6bf0ccdf24802c3c14c4 100644 (file)
@@ -10,13 +10,27 @@ from ..compat import (
     compat_urllib_parse,
 )
 from ..utils import (
+    ExtractorError,
     int_or_none,
     float_or_none,
     sanitized_Request,
 )
 
 
-class YandexMusicTrackIE(InfoExtractor):
+class YandexMusicBaseIE(InfoExtractor):
+    @staticmethod
+    def _handle_error(response):
+        error = response.get('error')
+        if error:
+            raise ExtractorError(error, expected=True)
+
+    def _download_json(self, *args, **kwargs):
+        response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
+        self._handle_error(response)
+        return response
+
+
+class YandexMusicTrackIE(YandexMusicBaseIE):
     IE_NAME = 'yandexmusic:track'
     IE_DESC = 'Яндекс.Музыка - Трек'
     _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
@@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor):
         return self._get_track_info(track)
 
 
-class YandexMusicPlaylistBaseIE(InfoExtractor):
+class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
     def _build_playlist(self, tracks):
         return [
             self.url_result(
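
The new base class centralizes API error handling: every _download_json response is checked for an 'error' key, which is surfaced as an expected ExtractorError instead of failing later with a confusing KeyError. The shape it guards against, with a synthetic payload:

    from youtube_dl.utils import ExtractorError

    def _handle_error(response):
        error = response.get('error')
        if error:
            raise ExtractorError(error, expected=True)

    _handle_error({'track': {}})                      # no error: passes through
    try:
        _handle_error({'error': 'session-expired'})   # synthetic error value
    except ExtractorError as e:
        print(e)                                      # reported as expected
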
index b29baafc441c220b4128c9363f341f7159b8df93..1124fe6c280cb0e23bee3a41ea323165ec714dce 100644 (file)
@@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):
         links = []
 
         sources = self._search_regex(
-            r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
         if sources:
             for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
                 links.append(link)
@@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):
             }
             # Video URL's path looks like this:
             #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
             # We will benefit from it by extracting some metadata
-            mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
             if mobj:
                 height = int(mobj.group('height'))
                 bitrate = int(mobj.group('bitrate'))
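
Dropping the leading slash from the regex is what lets the second path layout quoted in the comment above match as well; a quick check on both shapes (filenames shortened):

    import re

    for path in ('/201012/17/505835/720p_1500k_505835/video.mp4',
                 '/201012/17/505835/vl_240p_240k_505835/video.mp4'):
        m = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', path)
        print(m.group('height'), m.group('bitrate'))
    # -> 720 1500, then 240 240
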
index e24dd3e5b6ddbab40f128f29574777631c44ef80..27e67feb43efb9374c9638a2f5616770c83ed241 100644 (file)
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
 import itertools
 import json
 import os.path
+import random
 import re
 import time
 import traceback
@@ -382,7 +383,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                 'uploader': 'Philipp Hagemeister',
                 'uploader_id': 'phihag',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
                 'upload_date': '20121002',
+                'license': 'Standard YouTube License',
                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                 'categories': ['Science & Technology'],
                 'tags': ['youtube-dl'],
@@ -401,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'upload_date': '20120506',
                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                 'alt_title': 'I Love It (feat. Charli XCX)',
-                'description': 'md5:782e8651347686cba06e58f71ab51773',
+                'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
                 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
                          'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
                          'iconic ep', 'iconic', 'love', 'it'],
                 'uploader': 'Icona Pop',
                 'uploader_id': 'IconaPop',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+                'license': 'Standard YouTube License',
                 'creator': 'Icona Pop',
             }
         },
@@ -422,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:64249768eec3bc4276236606ea996373',
                 'uploader': 'justintimberlakeVEVO',
                 'uploader_id': 'justintimberlakeVEVO',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+                'license': 'Standard YouTube License',
                 'creator': 'Justin Timberlake',
                 'age_limit': 18,
             }
@@ -437,6 +444,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                 'uploader': 'SET India',
                 'uploader_id': 'setindia',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
+                'license': 'Standard YouTube License',
                 'age_limit': 18,
             }
         },
@@ -449,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                 'uploader': 'Philipp Hagemeister',
                 'uploader_id': 'phihag',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
                 'upload_date': '20121002',
+                'license': 'Standard YouTube License',
                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                 'categories': ['Science & Technology'],
                 'tags': ['youtube-dl'],
@@ -468,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'ext': 'm4a',
                 'upload_date': '20121002',
                 'uploader_id': '8KVIDEO',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
                 'description': '',
                 'uploader': '8KVIDEO',
+                'license': 'Standard YouTube License',
                 'title': 'UHDTV TEST 8K VIDEO.mp4'
             },
             'params': {
@@ -488,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'AfrojackVEVO',
                 'uploader_id': 'AfrojackVEVO',
                 'upload_date': '20131011',
+                'license': 'Standard YouTube License',
             },
             'params': {
                 'youtube_include_dash_manifest': True,
@@ -506,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'TaylorSwiftVEVO',
                 'uploader_id': 'TaylorSwiftVEVO',
                 'upload_date': '20140818',
+                'license': 'Standard YouTube License',
                 'creator': 'Taylor Swift',
             },
             'params': {
@@ -522,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'upload_date': '20100909',
                 'uploader': 'The Amazing Atheist',
                 'uploader_id': 'TheAmazingAtheist',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+                'license': 'Standard YouTube License',
                 'title': 'Burning Everyone\'s Koran',
                 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
             }
@@ -536,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
                 'uploader': 'The Witcher',
                 'uploader_id': 'WitcherGame',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
                 'upload_date': '20140605',
+                'license': 'Standard YouTube License',
                 'age_limit': 18,
             },
         },
@@ -550,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
                 'uploader': 'LloydVEVO',
                 'uploader_id': 'LloydVEVO',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
                 'upload_date': '20110629',
+                'license': 'Standard YouTube License',
                 'age_limit': 18,
             },
         },
@@ -562,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'ext': 'mp4',
                 'upload_date': '20100430',
                 'uploader_id': 'deadmau5',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',
                 'creator': 'deadmau5',
                 'description': 'md5:12c56784b8032162bb936a5f76d55360',
                 'uploader': 'deadmau5',
+                'license': 'Standard YouTube License',
                 'title': 'Deadmau5 - Some Chords (HD)',
                 'alt_title': 'Some Chords',
             },
@@ -580,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'ext': 'mp4',
                 'upload_date': '20150827',
                 'uploader_id': 'olympic',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
+                'license': 'Standard YouTube License',
                 'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
                 'uploader': 'Olympics',
                 'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
@@ -597,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'stretched_ratio': 16 / 9.,
                 'upload_date': '20110310',
                 'uploader_id': 'AllenMeow',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
                 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
                 'uploader': '孫艾倫',
+                'license': 'Standard YouTube License',
                 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
             },
         },
@@ -629,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:116377fd2963b81ec4ce64b542173306',
                 'upload_date': '20150625',
                 'uploader_id': 'dorappi2000',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
                 'uploader': 'dorappi2000',
+                'license': 'Standard YouTube License',
                 'formats': 'mincount:33',
             },
         },
@@ -644,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'uploader': 'Airtek',
                 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
                 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+                'license': 'Standard YouTube License',
                 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
             },
             'params': {
@@ -668,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'upload_date': '20150721',
                     'uploader': 'Beer Games Beer',
                     'uploader_id': 'beergamesbeer',
+                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
                 },
             }, {
                 'info_dict': {
@@ -678,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'upload_date': '20150721',
                     'uploader': 'Beer Games Beer',
                     'uploader_id': 'beergamesbeer',
+                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
                 },
             }, {
                 'info_dict': {
@@ -688,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'upload_date': '20150721',
                     'uploader': 'Beer Games Beer',
                     'uploader_id': 'beergamesbeer',
+                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
                 },
             }, {
                 'info_dict': {
@@ -698,6 +734,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     'upload_date': '20150721',
                     'uploader': 'Beer Games Beer',
                     'uploader_id': 'beergamesbeer',
+                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
                 },
             }],
             'params': {
@@ -731,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
                 'upload_date': '20151119',
                 'uploader_id': 'IronSoulElf',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
                 'uploader': 'IronSoulElf',
+                'license': 'Standard YouTube License',
                 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
             },
             'params': {
@@ -759,6 +799,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # Video licensed under Creative Commons
+            'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+            'info_dict': {
+                'id': 'M4gD1WSo5mA',
+                'ext': 'mp4',
+                'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+                'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+                'upload_date': '20150127',
+                'uploader_id': 'BerkmanCenter',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+                'uploader': 'BerkmanCenter',
+                'license': 'Creative Commons Attribution license (reuse allowed)',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Channel-like uploader_url
+            'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+            'info_dict': {
+                'id': 'eQcmzGIKrzg',
+                'ext': 'mp4',
+                'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+                'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+                'upload_date': '20151119',
+                'uploader': 'Bernie 2016',
+                'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+                'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+                'license': 'Creative Commons Attribution license (reuse allowed)',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
             'only_matching': True,
@@ -975,40 +1051,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return {}
         try:
             args = player_config['args']
-            caption_url = args['ttsurl']
-            if not caption_url:
-                self._downloader.report_warning(err_msg)
-                return {}
-            timestamp = args['timestamp']
-            # We get the available subtitles
-            list_params = compat_urllib_parse.urlencode({
-                'type': 'list',
-                'tlangs': 1,
-                'asrs': 1,
-            })
-            list_url = caption_url + '&' + list_params
-            caption_list = self._download_xml(list_url, video_id)
-            original_lang_node = caption_list.find('track')
-            if original_lang_node is None:
-                self._downloader.report_warning('Video doesn\'t have automatic captions')
-                return {}
-            original_lang = original_lang_node.attrib['lang_code']
-            caption_kind = original_lang_node.attrib.get('kind', '')
+            caption_url = args.get('ttsurl')
+            if caption_url:
+                timestamp = args['timestamp']
+                # We get the available subtitles
+                list_params = compat_urllib_parse.urlencode({
+                    'type': 'list',
+                    'tlangs': 1,
+                    'asrs': 1,
+                })
+                list_url = caption_url + '&' + list_params
+                caption_list = self._download_xml(list_url, video_id)
+                original_lang_node = caption_list.find('track')
+                if original_lang_node is None:
+                    self._downloader.report_warning('Video doesn\'t have automatic captions')
+                    return {}
+                original_lang = original_lang_node.attrib['lang_code']
+                caption_kind = original_lang_node.attrib.get('kind', '')
+
+                sub_lang_list = {}
+                for lang_node in caption_list.findall('target'):
+                    sub_lang = lang_node.attrib['lang_code']
+                    sub_formats = []
+                    for ext in self._SUBTITLE_FORMATS:
+                        params = compat_urllib_parse.urlencode({
+                            'lang': original_lang,
+                            'tlang': sub_lang,
+                            'fmt': ext,
+                            'ts': timestamp,
+                            'kind': caption_kind,
+                        })
+                        sub_formats.append({
+                            'url': caption_url + '&' + params,
+                            'ext': ext,
+                        })
+                    sub_lang_list[sub_lang] = sub_formats
+                return sub_lang_list
+
+            # Some videos don't provide ttsurl but rather caption_tracks and
+            # caption_translation_languages (e.g. 20LmZk1hakA)
+            caption_tracks = args['caption_tracks']
+            caption_translation_languages = args['caption_translation_languages']
+            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+            parsed_caption_url = compat_urlparse.urlparse(caption_url)
+            caption_qs = compat_parse_qs(parsed_caption_url.query)
 
             sub_lang_list = {}
-            for lang_node in caption_list.findall('target'):
-                sub_lang = lang_node.attrib['lang_code']
+            for lang in caption_translation_languages.split(','):
+                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+                sub_lang = lang_qs.get('lc', [None])[0]
+                if not sub_lang:
+                    continue
                 sub_formats = []
                 for ext in self._SUBTITLE_FORMATS:
-                    params = compat_urllib_parse.urlencode({
-                        'lang': original_lang,
-                        'tlang': sub_lang,
-                        'fmt': ext,
-                        'ts': timestamp,
-                        'kind': caption_kind,
+                    caption_qs.update({
+                        'tlang': [sub_lang],
+                        'fmt': [ext],
                     })
+                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
+                        query=compat_urllib_parse.urlencode(caption_qs, True)))
                     sub_formats.append({
-                        'url': caption_url + '&' + params,
+                        'url': sub_url,
                         'ext': ext,
                     })
                 sub_lang_list[sub_lang] = sub_formats
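
For videos without ttsurl, the player args instead expose caption_tracks (comma-separated query strings whose 'u' value is the timedtext URL) and caption_translation_languages (comma-separated, percent-encoded 'lc=...&n=...' pairs). A sketch of the parsing with synthetic args; Python 3 stdlib is used for brevity where the extractor uses the compat_* shims:

    from urllib.parse import parse_qs, unquote_plus, urlencode, urlparse, urlunparse

    caption_tracks = 'u=https%3A%2F%2Fwww.youtube.com%2Fapi%2Ftimedtext%3Flang%3Den%26v%3Dxxx&lc=en'
    caption_translation_languages = 'lc%3Dde%26n%3DGerman,lc%3Dfr%26n%3DFrench'

    caption_url = parse_qs(caption_tracks.split(',')[0])['u'][0]
    parsed = urlparse(caption_url)
    caption_qs = parse_qs(parsed.query)
    for lang in caption_translation_languages.split(','):
        sub_lang = parse_qs(unquote_plus(lang)).get('lc', [None])[0]
        caption_qs.update({'tlang': [sub_lang], 'fmt': ['vtt']})
        print(sub_lang, urlunparse(parsed._replace(
            query=urlencode(caption_qs, True))))
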
@@ -1019,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             self._downloader.report_warning(err_msg)
             return {}
 
+    def _mark_watched(self, video_id, video_info):
+        playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+        if not playback_url:
+            return
+        parsed_playback_url = compat_urlparse.urlparse(playback_url)
+        qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+        # The cpn generation algorithm is reverse-engineered from base.js.
+        # In fact it works even with a dummy cpn.
+        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+        qs.update({
+            'ver': ['2'],
+            'cpn': [cpn],
+        })
+        playback_url = compat_urlparse.urlunparse(
+            parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+        self._download_webpage(
+            playback_url, video_id, 'Marking watched',
+            'Unable to mark watched', fatal=False)
+
     @classmethod
     def extract_id(cls, url):
         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
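
In _mark_watched above, the watch ping just replays videostats_playback_base_url with ver=2 and a 16-character cpn appended; since the server accepts even a dummy cpn, generating one is a one-liner (the & 63 guarantees every index stays inside the 64-character alphabet):

    import random

    CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
    cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(16))
    assert len(cpn) == 16 and all(c in CPN_ALPHABET for c in cpn)
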
@@ -1245,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         # uploader_id
         video_uploader_id = None
-        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+        video_uploader_url = None
+        mobj = re.search(
+            r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+            video_webpage)
         if mobj is not None:
-            video_uploader_id = mobj.group(1)
+            video_uploader_id = mobj.group('uploader_id')
+            video_uploader_url = mobj.group('uploader_url')
         else:
             self._downloader.report_warning('unable to extract uploader nickname')
 
@@ -1275,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
         upload_date = unified_strdate(upload_date)
 
+        video_license = self._html_search_regex(
+            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+            video_webpage, 'license', default=None)
+
         m_music = re.search(
             r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
             video_webpage)
@@ -1348,6 +1482,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
             if 'rtmpe%3Dyes' in encoded_url_map:
                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
+            formats_spec = {}
+            fmt_list = video_info.get('fmt_list', [''])[0]
+            if fmt_list:
+                for fmt in fmt_list.split(','):
+                    spec = fmt.split('/')
+                    if len(spec) > 1:
+                        width_height = spec[1].split('x')
+                        if len(width_height) == 2:
+                            formats_spec[spec[0]] = {
+                                'resolution': spec[1],
+                                'width': int_or_none(width_height[0]),
+                                'height': int_or_none(width_height[1]),
+                            }
             formats = []
             for url_data_str in encoded_url_map.split(','):
                 url_data = compat_parse_qs(url_data_str)
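
fmt_list maps itags to resolutions as comma-separated itag/WIDTHxHEIGHT/... entries; parsing a synthetic value shows the formats_spec shape that later backfills width/height on formats built from the URL map:

    fmt_list = '22/1280x720/9/0/115,18/640x360/9/0/115'   # synthetic sample
    formats_spec = {}
    for fmt in fmt_list.split(','):
        spec = fmt.split('/')
        if len(spec) > 1:
            width, _, height = spec[1].partition('x')
            formats_spec[spec[0]] = {
                'resolution': spec[1],
                'width': int(width),
                'height': int(height),
            }
    print(formats_spec['22'])   # {'resolution': '1280x720', 'width': 1280, 'height': 720}
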
@@ -1416,6 +1563,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 }
                 if format_id in self._formats:
                     dct.update(self._formats[format_id])
+                if format_id in formats_spec:
+                    dct.update(formats_spec[format_id])
 
                 # Some itags are not included in DASH manifest thus corresponding formats will
                 # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
@@ -1528,11 +1677,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         self._sort_formats(formats)
 
+        self.mark_watched(video_id, video_info)
+
         return {
             'id': video_id,
             'uploader': video_uploader,
             'uploader_id': video_uploader_id,
+            'uploader_url': video_uploader_url,
             'upload_date': upload_date,
+            'license': video_license,
             'creator': video_creator,
             'title': video_title,
             'alt_title': video_alt_title,
index c619a75e2a2d6617fd083c44bf05114d9e4c83bc..81c22a6270f99eb08efd5dc5cfb798a860c4c6c8 100644 (file)
@@ -137,6 +137,10 @@ class ZDFIE(InfoExtractor):
                 formats.extend(self._extract_smil_formats(
                     video_url, video_id, fatal=False))
             elif ext == 'm3u8':
+                # the certificates are misconfigured (see
+                # https://github.com/rg3/youtube-dl/issues/8665)
+                if video_url.startswith('https://'):
+                    continue
                 formats.extend(self._extract_m3u8_formats(
                     video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
             elif ext == 'f4m':
index 3afa8bb6faac3610dfc3695fe27d6006ff9339b5..9dd7a80346c2704c91cbaccb17096ac24a35f363 100644 (file)
@@ -170,6 +170,14 @@ def parseOpts(overrideArguments=None):
         action='store_const', dest='extract_flat', const='in_playlist',
         default=False,
         help='Do not extract the videos of a playlist, only list them.')
+    general.add_option(
+        '--mark-watched',
+        action='store_true', dest='mark_watched', default=False,
+        help='Mark videos watched (YouTube only)')
+    general.add_option(
+        '--no-mark-watched',
+        action='store_false', dest='mark_watched', default=False,
+        help='Do not mark videos watched (YouTube only)')
     general.add_option(
         '--no-color', '--no-colors',
         action='store_true', dest='no_color',
index 0d8ef6ca26c6ef7f1b7b402b387d20eebd3f8a8f..3ea5183999d5ed2adacbb05bccc6af93e8ac6750 100644 (file)
@@ -6,6 +6,7 @@ from .ffmpeg import (
     FFmpegEmbedSubtitlePP,
     FFmpegExtractAudioPP,
     FFmpegFixupStretchedPP,
+    FFmpegFixupM3u8PP,
     FFmpegFixupM4aPP,
     FFmpegMergerPP,
     FFmpegMetadataPP,
@@ -26,6 +27,7 @@ __all__ = [
     'ExecAfterDownloadPP',
     'FFmpegEmbedSubtitlePP',
     'FFmpegExtractAudioPP',
+    'FFmpegFixupM3u8PP',
     'FFmpegFixupM4aPP',
     'FFmpegFixupStretchedPP',
     'FFmpegMergerPP',
index e19dbf73d5fe36c602d9ffb83cd2d02ab39cb5e1..3bad5a266b6d51aaf0c92224a94986957da230f2 100644 (file)
@@ -40,7 +40,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
                 'Skipping embedding the thumbnail because the file is missing.')
             return [], info
 
-        if info['ext'] == 'mp3':
+        if info['ext'] in ('mp3', 'mkv'):
             options = [
                 '-c', 'copy', '-map', '0', '-map', '1',
                 '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
index cc7aaeda397ba8b5dd368c770e39a7a4c34f4719..a8819f258013de5a1cbbf1a5b42ca87b2b4ef14e 100644 (file)
@@ -404,10 +404,6 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
         for (name, value) in metadata.items():
             options.extend(['-metadata', '%s=%s' % (name, value)])
 
-        # https://github.com/rg3/youtube-dl/issues/8350
-        if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False):
-            options.extend(['-bsf:a', 'aac_adtstoasc'])
-
         self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
         self.run_ffmpeg(filename, temp_filename, options)
         os.remove(encodeFilename(filename))
@@ -480,6 +476,21 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):
         return [], info
 
 
+class FFmpegFixupM3u8PP(FFmpegPostProcessor):
+    def run(self, info):
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+        self._downloader.to_screen('[ffmpeg] Fixing malformed aac bitstream in "%s"' % filename)
+        self.run_ffmpeg(filename, temp_filename, options)
+
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return [], info
+
+
 class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
     def __init__(self, downloader=None, format=None):
         super(FFmpegSubtitlesConvertorPP, self).__init__(downloader)
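
For reference, the FFmpegFixupM3u8PP above amounts to a stream copy that remuxes into MP4 while rewriting the ADTS AAC bitstream; a hedged reconstruction of the command run_ffmpeg ends up executing (file names hypothetical):

    import subprocess

    subprocess.check_call([
        'ffmpeg', '-y', '-i', 'video.mp4',                   # hypothetical input
        '-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc',
        'video.temp.mp4',                                    # then renamed back
    ])
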
index 480d48d057400fafb0acdfc8492fca31b1d2f674..e39ca60aa08326b6f05814ff800bb09c75755e48 100644 (file)
@@ -6,6 +6,7 @@ import sys
 import errno
 
 from .common import PostProcessor
+from ..compat import compat_os_name
 from ..utils import (
     check_executable,
     hyphenate_date,
@@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor):
                     raise XAttrMetadataError(e.errno, e.strerror)
 
         except ImportError:
-            if os.name == 'nt':
+            if compat_os_name == 'nt':
                 # Write xattrs to NTFS Alternate Data Streams:
                 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
                 def write_xattr(path, key, value):
@@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor):
                     'Unable to write extended attributes due to too long values.')
             else:
                 msg = 'This filesystem doesn\'t support extended attributes. '
-                if os.name == 'nt':
+                if compat_os_name == 'nt':
                     msg += 'You need to use NTFS.'
                 else:
                     msg += '(You may have to enable them in your /etc/fstab)'
index 672ce05ea17b9d0ab9c009c383810ff54a97dd10..9fd0ec8d5856cbee27534c0f4a02cc90b05f8389 100644 (file)
@@ -4,6 +4,7 @@
 from __future__ import unicode_literals
 
 import base64
+import binascii
 import calendar
 import codecs
 import contextlib
@@ -159,8 +160,6 @@ if sys.version_info >= (2, 7):
     def find_xpath_attr(node, xpath, key, val=None):
         """ Find the xpath xpath[@key=val] """
         assert re.match(r'^[a-zA-Z_-]+$', key)
-        if val:
-            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
         return node.find(expr)
 else:
@@ -466,6 +465,10 @@ def encodeFilename(s, for_subprocess=False):
     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
         return s
 
+    # Jython assumes filenames are Unicode strings even though it reports itself as Python 2.x compatible
+    if sys.platform.startswith('java'):
+        return s
+
     return s.encode(get_subprocess_encoding(), 'ignore')
 
 
@@ -904,9 +907,9 @@ def unified_strdate(date_str, day_first=True):
         '%d %b %Y',
         '%B %d %Y',
         '%b %d %Y',
-        '%b %dst %Y %I:%M%p',
-        '%b %dnd %Y %I:%M%p',
-        '%b %dth %Y %I:%M%p',
+        '%b %dst %Y %I:%M',
+        '%b %dnd %Y %I:%M',
+        '%b %dth %Y %I:%M',
         '%Y %m %d',
         '%Y-%m-%d',
         '%Y/%m/%d',
@@ -1216,13 +1219,23 @@ if sys.platform == 'win32':
             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
 
 else:
-    import fcntl
+    # Some platforms, such as Jython, lack fcntl
+    try:
+        import fcntl
 
-    def _lock_file(f, exclusive):
-        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+        def _lock_file(f, exclusive):
+            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
 
-    def _unlock_file(f):
-        fcntl.flock(f, fcntl.LOCK_UN)
+        def _unlock_file(f):
+            fcntl.flock(f, fcntl.LOCK_UN)
+    except ImportError:
+        UNSUPPORTED_MSG = 'file locking is not supported on this platform'
+
+        def _lock_file(f, exclusive):
+            raise IOError(UNSUPPORTED_MSG)
+
+        def _unlock_file(f):
+            raise IOError(UNSUPPORTED_MSG)
 
 
 class locked_file(object):
@@ -1303,6 +1316,17 @@ def format_bytes(bytes):
     return '%.2f%s' % (converted, suffix)
 
 
+def lookup_unit_table(unit_table, s):
+    units_re = '|'.join(re.escape(u) for u in unit_table)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    if not m:
+        return None
+    num_str = m.group('num').replace(',', '.')
+    mult = unit_table[m.group('unit')]
+    return int(float(num_str) * mult)
+
+
 def parse_filesize(s):
     if s is None:
         return None
@@ -1346,15 +1370,28 @@ def parse_filesize(s):
         'Yb': 1000 ** 8,
     }
 
-    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(
-        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
-    if not m:
+    return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_count(s):
+    if s is None:
         return None
 
-    num_str = m.group('num').replace(',', '.')
-    mult = _UNIT_TABLE[m.group('unit')]
-    return int(float(num_str) * mult)
+    s = s.strip()
+
+    if re.match(r'^[\d,.]+$', s):
+        return str_to_int(s)
+
+    _UNIT_TABLE = {
+        'k': 1000,
+        'K': 1000,
+        'm': 1000 ** 2,
+        'M': 1000 ** 2,
+        'kk': 1000 ** 2,
+        'KK': 1000 ** 2,
+    }
+
+    return lookup_unit_table(_UNIT_TABLE, s)
 
 
 def month_by_name(name):
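
lookup_unit_table now backs both parse_filesize and the new parse_count; a few sample conversions (inputs synthetic):

    from youtube_dl.utils import parse_count, parse_filesize

    print(parse_count('1.8M'))       # 1800000
    print(parse_count('1,730'))      # 1730 (plain numbers go through str_to_int)
    print(parse_filesize('1.2 MiB')) # 1258291
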
@@ -1386,6 +1423,12 @@ def fix_xml_ampersands(xml_str):
 
 def setproctitle(title):
     assert isinstance(title, compat_str)
+
+    # ctypes in Jython is not complete
+    # http://bugs.jython.org/issue2148
+    if sys.platform.startswith('java'):
+        return
+
     try:
         libc = ctypes.cdll.LoadLibrary('libc.so.6')
     except OSError:
@@ -1569,9 +1612,12 @@ class PagedList(object):
 
 
 class OnDemandPagedList(PagedList):
-    def __init__(self, pagefunc, pagesize):
+    def __init__(self, pagefunc, pagesize, use_cache=False):
         self._pagefunc = pagefunc
         self._pagesize = pagesize
+        self._use_cache = use_cache
+        if use_cache:
+            self._cache = {}
 
     def getslice(self, start=0, end=None):
         res = []
@@ -1581,7 +1627,13 @@ class OnDemandPagedList(PagedList):
             if start >= nextfirstid:
                 continue
 
-            page_results = list(self._pagefunc(pagenum))
+            page_results = None
+            if self._use_cache:
+                page_results = self._cache.get(pagenum)
+            if page_results is None:
+                page_results = list(self._pagefunc(pagenum))
+            if self._use_cache:
+                self._cache[pagenum] = page_results
 
             startv = (
                 start % self._pagesize
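
With use_cache=True, OnDemandPagedList fetches each page at most once and replays it from the dict on later slices; a sketch with a synthetic pagefunc:

    from youtube_dl.utils import OnDemandPagedList

    def pagefunc(pagenum):
        print('fetching page %d' % pagenum)
        return range(pagenum * 10, (pagenum + 1) * 10)

    pl = OnDemandPagedList(pagefunc, 10, use_cache=True)
    pl.getslice(0, 5)   # prints: fetching page 0
    pl.getslice(0, 5)   # served from the cache, no fetch
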
@@ -1711,6 +1763,15 @@ def urlencode_postdata(*args, **kargs):
     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
 
 
+def update_url_query(url, query):
+    parsed_url = compat_urlparse.urlparse(url)
+    qs = compat_parse_qs(parsed_url.query)
+    qs.update(query)
+    qs = encode_dict(qs)
+    return compat_urlparse.urlunparse(parsed_url._replace(
+        query=compat_urllib_parse.urlencode(qs, True)))
+
+
 def encode_dict(d, encoding='utf-8'):
     def encode(v):
         return v.encode(encoding) if isinstance(v, compat_basestring) else v
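
update_url_query above merges new parameters into whatever query string a URL already carries; a quick demonstration (parameter order in the result may vary):

    from youtube_dl.utils import update_url_query

    print(update_url_query('http://example.com/path?quality=low', {'format': 'json'}))
    # e.g. http://example.com/path?quality=low&format=json
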
@@ -1835,11 +1896,21 @@ def error_to_compat_str(err):
 
 
 def mimetype2ext(mt):
+    ext = {
+        'audio/mp4': 'm4a',
+    }.get(mt)
+    if ext is not None:
+        return ext
+
     _, _, res = mt.rpartition('/')
 
     return {
         '3gpp': '3gp',
+        'smptett+xml': 'tt',
+        'srt': 'srt',
+        'ttaf+xml': 'dfxp',
         'ttml+xml': 'ttml',
+        'vtt': 'vtt',
         'x-flv': 'flv',
         'x-mp4-fragmented': 'mp4',
         'x-ms-wmv': 'wmv',
@@ -2582,3 +2653,58 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
             return None  # No Proxy
         return compat_urllib_request.ProxyHandler.proxy_open(
             self, req, proxy, type)
+
+
+def ohdave_rsa_encrypt(data, exponent, modulus):
+    '''
+    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
+
+    Input:
+        data: data to encrypt, bytes-like object
+        exponent, modulus: parameters e and N of the RSA algorithm, both integers
+    Output: hex string of encrypted data
+
+    Limitation: supports one block encryption only
+    '''
+
+    payload = int(binascii.hexlify(data[::-1]), 16)
+    encrypted = pow(payload, exponent, modulus)
+    return '%x' % encrypted
+
+
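
A toy invocation of ohdave_rsa_encrypt with deliberately tiny parameters (e=3, N=2753 are illustrative, not a real key): the payload is the byte string read little-endian as an integer, and the result is the hex of its modular exponentiation:

    from youtube_dl.utils import ohdave_rsa_encrypt

    print(ohdave_rsa_encrypt(b'\x02', 3, 2753))   # '8', i.e. 2 ** 3 mod 2753
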
+def encode_base_n(num, n, table=None):
+    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    if not table:
+        table = FULL_TABLE[:n]
+
+    if n > len(table):
+        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+
+    if num == 0:
+        return table[0]
+
+    ret = ''
+    while num:
+        ret = table[num % n] + ret
+        num = num // n
+    return ret
+
+
+def decode_packed_codes(code):
+    mobj = re.search(
+        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
+        code)
+    obfuscated_code, base, count, symbols = mobj.groups()
+    base = int(base)
+    count = int(count)
+    symbols = symbols.split('|')
+    symbol_table = {}
+
+    while count:
+        count -= 1
+        base_n_count = encode_base_n(count, base)
+        symbol_table[base_n_count] = symbols[count] or base_n_count
+
+    return re.sub(
+        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+        obfuscated_code)
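
decode_packed_codes unpacks Dean Edwards P.A.C.K.E.R.-style obfuscation: every \w+ token in the payload is an index into the symbol list, written in base `base` via encode_base_n. A minimal round trip on a synthetic packed blob:

    from youtube_dl.utils import decode_packed_codes

    packed = "}('0(1)',2,2,'alert|hi'.split('|')"   # synthetic P.A.C.K.E.R. tail
    print(decode_packed_codes(packed))              # alert(hi)
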
index 9aca8001aa0d623c64695589c0a3d0ca2689593f..246f5740d2920dcf9ce4c586ded6c6f47767235c 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.02.13'
+__version__ = '2016.03.06'